def PrintResults(urldata):
    intradata = get_and_proess_data(urldata)
    # get an instance of the algorithm class
    run_algo = EarningsAlgorithm.Algorithm()
    train_file = df.from_csv("data/intradata_frompython_train.csv", index_col=False, header=0)
    train_file_output = 'data/intradata_frompython_train_results.csv'
    run_algo.train_algo(train_file, train_file_output)
    test_file = df.from_csv("data/intradata_frompython_test.csv", index_col=False, header=0)
    test_file_output = 'data/intradata_frompython_test_results.csv'
    agg_signals, past_5_signals, past_5_values, past_5_bayes = run_algo.test_algo(test_file, test_file_output)
    outputs = []
    outputs.append(str(agg_signals))
    outputs.append(str(past_5_signals))
    outputs.append(str(past_5_values))
    outputs.append(str(past_5_bayes))
def run_my_task(self, inputs, settings, outputs):
    # Get files.
    fermata_indices_file = inputs['Cadence Indexer - fermata indices (Pandas DataFrame csv)'][0]['resource_path']
    infile = inputs['Cadence Indexer - figured bass (Pandas DataFrame csv)'][0]['resource_path']
    outfile = outputs['Cadence Indexer - Pandas DataFrame csv'][0]['resource_path']

    # De-serialize the DataFrames.
    fermata_indices = DataFrame.from_csv(fermata_indices_file, header=[0, 1])  # We know the first two rows constitute a MultiIndex
    figured_bass = DataFrame.from_csv(infile, header=[0, 1])  # We know the first two rows constitute a MultiIndex

    # Add fermatas to the DataFrame.
    cadence_marker = fermata_indices.apply(lambda x: 'Fermata' in x.values, axis=1)
    pieces = {'Basso seguente': figured_bass['Basso seguente']['3'],
              'Figured bass': figured_bass['Figured bass'].T.loc[['[0,3 1,3 2,3] (3)']].T,
              'Cadence': cadence_marker}
    figured_bass = concat(pieces, axis=1)

    # Find cadences.
    marker_column = 'Cadence'
    cadence_size = 4
    indices = figured_bass[figured_bass[marker_column][0] == True].index
    cadences = []
    for index in indices:
        cadenceEndLocation = figured_bass.index.get_loc(index)
        harmonies = []
        for cadenceStep in range(cadenceEndLocation - cadence_size + 1, cadenceEndLocation + 1):
            harmonies.append(figured_bass.iloc[cadenceStep])
        cadence = DataFrame(harmonies)
        cadences.append(cadence)

    # Output.
    self.write_cadences_to_file(cadences, outfile)
    return True
def main(): print("==== START ====") dataset = DataFrame.from_csv('../data/stats/shots_teams_2013_2014.tsv', sep='\t', index_col=False) current_day = DataFrame.from_csv('../data/stats/shots_players_2015.tsv', sep='\t', index_col=False) target = dataset.loc[:,'goal'] train = dataset.loc[:,['degree', 'distance', 'shot_headed', 'corner']] # For using the model train_target = target train_features = train dataset_test = current_day test_target = dataset_test.loc[:,'goal'] test_features = dataset_test.loc[:,['degree', 'distance', 'shot_headed', 'corner']] dataset_test = dataset_test.reset_index(drop=True) test_target = test_target.reset_index(drop=True) test_features = test_features.reset_index(drop=True) model = LogisticRegression() model = model.fit(train_features, train_target) predicted_probs = model.predict_proba(test_features) predicted_goals = DataFrame(predicted_probs[:,1], columns=['predict']) results = concat([dataset_test, predicted_goals], axis=1) grouped_results = results.groupby(['start', 'name']).sum() grouped_results["count"] = results.groupby(['start', 'name']).size() grouped_results["ratio"] = grouped_results["predict"]/grouped_results["count"] DataFrame(grouped_results).to_csv('../data/stats/exp_goals_players_2015.tsv', sep='\t', encoding='utf-8') print("==== END ====")
def test_to_csv_from_csv1(self):
    with ensure_clean('__tmp_to_csv_from_csv1__') as path:
        self.frame['A'][:5] = nan

        self.frame.to_csv(path)
        self.frame.to_csv(path, columns=['A', 'B'])
        self.frame.to_csv(path, header=False)
        self.frame.to_csv(path, index=False)

        # test roundtrip
        self.tsframe.to_csv(path)
        recons = DataFrame.from_csv(path)
        assert_frame_equal(self.tsframe, recons)

        self.tsframe.to_csv(path, index_label='index')
        recons = DataFrame.from_csv(path, index_col=None)
        assert(len(recons.columns) == len(self.tsframe.columns) + 1)

        # no index
        self.tsframe.to_csv(path, index=False)
        recons = DataFrame.from_csv(path, index_col=None)
        assert_almost_equal(self.tsframe.values, recons.values)

        # corner case
        dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
                        's2': Series(lrange(2), lrange(2))})
        dm.to_csv(path)
        recons = DataFrame.from_csv(path)
        assert_frame_equal(dm, recons)
def __train_on_all_features(filename, up_filename, use_sgd_settings=False):
    csv_file = __get_filename(NEW_PATH, up_filename)
    dataframe = DataFrame.from_csv(csv_file, encoding='utf-8')
    print("Retrieving questions and classification labels...")
    training_data = dataframe[const.QUESTION_TEXT_KEY].copy()
    class_labels = dataframe[const.CLASS_LABEL_KEY].copy()
    print("Starting training of model")
    file = NEW_PATH + "models" + const.SEPARATOR + FILENAME_START + "_" + up_filename + ".pkl"
    model = create_and_save_model(training_data, class_labels, file, predict_proba=True,
                                  test_size=float(0.2), random_state=0, print_results=True,
                                  use_sgd_settings=use_sgd_settings)
    if model is not None:
        pipeline_svm = model.best_estimator_
        # set up the parameter values
        param_svm = [
            {
                'clf__C': [model.best_params_['clf__C']],
                'clf__kernel': [model.best_params_['clf__kernel']],
            },
        ]
        # check if gamma is a part of the parameters
        if model.best_params_.get('clf__gamma') is not None:
            param_svm[0]['clf__gamma'] = [model.best_params_.get('clf__gamma')]
        csv_file = NEW_PATH + const.SEPARATOR + filename + FILE_ENDING
        dataframe = DataFrame.from_csv(csv_file, encoding='utf-8')
        print("Retrieving questions and classification labels...")
        training_data = dataframe[const.QUESTION_TEXT_KEY].copy()
        class_labels = dataframe[const.CLASS_LABEL_KEY].copy()
        print("Starting training of model")
        filename = NEW_PATH + "models" + const.SEPARATOR + filename + ".pkl"
        create_singular_feature_detector_model(pipeline_svm, param_svm, filename,
                                               training_data, class_labels,
                                               test_size=float(0.2), random_state=0)
def test_to_csv_from_csv2(self):
    with ensure_clean('__tmp_to_csv_from_csv2__') as path:
        # duplicate index
        df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = DataFrame.from_csv(path)
        assert_frame_equal(result, df)

        midx = MultiIndex.from_tuples(
            [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
        df = DataFrame(np.random.randn(3, 3), index=midx,
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = DataFrame.from_csv(path, index_col=[0, 1, 2],
                                    parse_dates=False)
        # TODO from_csv names index ['Unnamed: 1', 'Unnamed: 2'], should it?
        assert_frame_equal(result, df, check_names=False)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_csv(path, header=col_aliases)
        rs = DataFrame.from_csv(path)
        xp = self.frame2.copy()
        xp.columns = col_aliases
        assert_frame_equal(xp, rs)

        self.assertRaises(ValueError, self.frame2.to_csv, path,
                          header=['AA', 'X'])
def compile_contig_matrices(names=None):
    if names is None:
        fs = sh.find(raw_ass, "-name", "*-smds.coverage.percontig").stdout.split("\n")[:-1]
        fs = [f for f in fs if "_2ref" in f]
    else:
        fs = sh.find(merged_ass + names, "-name", "*-smds.coverage.percontig").stdout.split("\n")[:-1]
    df = DataFrame.from_csv(fs[0], sep="\t")
    glob = df.ix[:, 0:2]
    for f in fs:
        if names is None:
            id = [c for c in f.split("/") if "IH" in c][0]
        else:
            id = [c.replace("map_", "") for c in f.split("/") if "map_" in c][0]
        values = DataFrame.from_csv(f, sep="\t")["cov_mean_sample_0"]
        assert sum([a != b for a, b in zip(values.index, glob.index)]) == 0
        if sum([a != b for a, b in zip(values.index, glob.index)]) == 0:
            glob[id] = values
        else:
            print f, "is weird"
    if names is None:
        glob.to_csv(stats_out + "all_contig_coverages.csv", sep="\t")
    else:
        glob.to_csv(stats_out + names + "_contig_coverages.csv", sep="\t")
        glob[samples].to_csv(stats_out + names + "_contig_coverages_for_concoct.csv", sep="\t")
urldata = {}
urldata['q'] = ticker = 'SPY'
urldata['x'] = 'NYSEARCA'
#urldata['x'] = 'NASDAQ'
urldata['i'] = 900
urldata['p'] = '15d'  # number of past trading days
urldata['f'] = 'd,o,h,l,c,v'  # requested data: d is time, o is open, c is closing,

intradata = get_and_proess_data(urldata)
# get an instance of the algorithm class
run_algo = EarningsAlgorithm.Algorithm()
train_file = df.from_csv("data/intradata_frompython_train.csv", index_col=False, header=0)
train_file_output = 'data/intradata_frompython_train_results.csv'
run_algo.train_algo(train_file, train_file_output)
test_file = df.from_csv("data/intradata_frompython_test.csv", index_col=False, header=0)
test_file_output = 'data/intradata_frompython_test_results.csv'
agg_signals, past_5_signals, past_5_values, past_5_bayes = run_algo.test_algo(
    test_file, test_file_output)
def __init__(self):
    self.database = Database(Config(False))
    self.features = DataFrame.from_csv('features.csv')
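# Note: DataFrame.from_csv, used throughout these snippets, was deprecated in
# pandas 0.21 and removed in pandas 1.0. A minimal migration sketch (the file
# name is the one from the snippet above); from_csv defaulted to index_col=0
# and parse_dates=True, so the explicit arguments preserve its behaviour:
import pandas as pd

features = pd.read_csv('features.csv', index_col=0, parse_dates=True)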
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot, ion, show  # interactive plotting
from pandas import DataFrame as df

Source = "~/Documents/repositories/SalesForecast/Source"
ItemKeyName = ['pid', 'size']
items = df.from_csv(path=Source + "/items.csv", sep='|')
prices = df.from_csv(path=Source + "/prices.csv", sep='|')
sales = df.from_csv(path=Source + "/train.csv", sep='|')

# reshape DataFrame
# ----------------------------------------------------------
# column(s) to index
items.set_index(['size'], append=True, inplace=True)
prices.set_index(['size'], append=True, inplace=True)
prices = prices.transpose()
# convert data type for index
prices = prices.reindex(pd.to_datetime(prices.index))

# cumulatively sold units for each item
# data['sum_Times'] = data['Times'].groupby(['userID']).cumsum()  # know about it
sales = sales.sort_values(by=['pid', 'size'])
sales.loc[:, 'cumUnits'] = sales.groupby(ItemKeyName)['units'].cumsum()
sales.set_index(keys=ItemKeyName, append=True, inplace=True)
sales = sales.swaplevel(i='date', j='pid')
sales = sales.swaplevel(i='date', j='size')
"""
reshaped items, prices and sales
pfam_sim = lambda p, q: float(len(p.pfams.intersection(q.pfams))) / max(len(p.pfams), len(q.pfams)) if max(len(p.pfams), len(q.pfams)) != 0 else None

pfam_simi = {}
for c in tqdm(subset_big_fams.values()):
    for g1 in tqdm(c):
        for g2 in c:
            if (g1, g2) not in pfam_simi:
                pfam_simi[(g1, g2)] = pfam_sim(g1, g2)

for g in tqdm(all_gs):
    if not os.path.exists(pjoin(g.path, "ref")):
        bench.tools['BBMap'].make_index(g)

DataFrame.from_dict({(k[0].name, k[1].name): {'pfam_simi': v}
                     for k, v in pfam_simi.items()}).transpose().to_csv("pfam_simis.csv")

ANIs = {}
tt = DataFrame.from_csv("ANIs_fams.csv")
ANIs = {(t[0], t[1][0]): {'ANI': t[1][1], 'coverage': t[1][2]} for t in tt.iterrows()}

for c in tqdm(subset_big_fams.values()):
    to_compute = set()
    for g1 in tqdm(c):
        for g2 in c:
            if not (g1, g2) in to_compute and not (g2, g1) in to_compute \
                    and not (g1.name, g2.name) in ANIs.keys() \
                    and not (g2.name, g1.name) in ANIs.keys():
                to_compute.add((g1, g2))
    data = Parallel(n_jobs=num_cores)(delayed(single_ANI)(i) for i in tqdm(to_compute))
    ANIs.update(data)

DataFrame.from_dict(ANIs).transpose().to_csv("ANIs_fams.csv")
def get_dataset(self, *args, **kwargs):
    return DataFrame.from_csv(self.resource.resource_file.path, index_col=None)
def test_to_csv_multiindex(self):
    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        frame.to_csv(path, header=False)
        frame.to_csv(path, columns=['A', 'B'])

        # round trip
        frame.to_csv(path)
        df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False)

        # TODO to_csv drops column name
        assert_frame_equal(frame, df, check_names=False)
        self.assertEqual(frame.index.names, df.index.names)

        # needed if setUp becomes a classmethod
        self.frame.index = old_index

        # try multiindex with dates
        tsframe = self.tsframe
        old_index = tsframe.index
        new_index = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_csv(path, index_label=['time', 'foo'])
        recons = DataFrame.from_csv(path, index_col=[0, 1])
        # TODO to_csv drops column name
        assert_frame_equal(tsframe, recons, check_names=False)

        # do not load index
        tsframe.to_csv(path)
        recons = DataFrame.from_csv(path, index_col=None)
        self.assertEqual(len(recons.columns), len(tsframe.columns) + 2)

        # no index
        tsframe.to_csv(path, index=False)
        recons = DataFrame.from_csv(path, index_col=None)
        assert_almost_equal(recons.values, self.tsframe.values)

        # needed if setUp becomes classmethod
        self.tsframe.index = old_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # GH3571, GH1651, GH3141

        def _make_frame(names=None):
            if names is True:
                names = ['first', 'second']
            return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                             columns=MultiIndex.from_tuples(
                                 [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')],
                                 names=names),
                             dtype='int64')

        # column & index are multi-index
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # column is mi
        df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=0,
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # dup column names?
        df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # writing with no index
        df = _make_frame()
        df.to_csv(path, tupleize_cols=False, index=False)
        result = read_csv(path, header=[0, 1], tupleize_cols=False)
        assert_frame_equal(df, result)

        # we lose the names here
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False, index=False)
        result = read_csv(path, header=[0, 1], tupleize_cols=False)
        self.assertTrue(all([x is None for x in result.columns.names]))
        result.columns.names = df.columns.names
        assert_frame_equal(df, result)

        # tupleize_cols=True and index=False
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=True, index=False)
        result = read_csv(path, header=0, tupleize_cols=True, index_col=None)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # whatsnew example
        df = _make_frame()
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1], index_col=[0],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1], index_col=[0],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # column & index are multi-index (compatibility)
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=True)
        result = read_csv(path, header=0, index_col=[0, 1],
                          tupleize_cols=True)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # invalid options
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False)

        for i in [6, 7]:
            msg = 'len of {i}, but only 5 lines in file'.format(i=i)
            with assertRaisesRegexp(ParserError, msg):
                read_csv(path, tupleize_cols=False,
                         header=lrange(i), index_col=0)

        # write with cols
        with assertRaisesRegexp(TypeError, 'cannot specify cols with a '
                                'MultiIndex'):
            df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar'])

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # empty
        tsframe[:0].to_csv(path)
        recons = DataFrame.from_csv(path)

        exp = tsframe[:0]
        exp.index = []

        tm.assert_index_equal(recons.columns, exp.columns)
        self.assertEqual(len(recons), 0)
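# A minimal sketch of the same MultiIndex-column round trip on a modern pandas
# (>= 1.0), where tupleize_cols and DataFrame.from_csv no longer exist; the
# frame mirrors _make_frame above:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)),
                  columns=pd.MultiIndex.from_tuples(
                      [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')],
                      names=['first', 'second']))
df.to_csv('mi_roundtrip.csv')
result = pd.read_csv('mi_roundtrip.csv', header=[0, 1], index_col=0)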
import re

# sample_twitter is assumed to be defined earlier in the script
capitalized_words = r"[A-Z]\w+"
print(re.findall(capitalized_words, sample_twitter))

# Split my_string on spaces and print the result
spaces = r"\s+"
print(re.split(spaces, sample_twitter))

# Find all digits in my_string and print the result
digits = r"\d+"
re.findall(digits, sample_twitter)

from pandas import DataFrame
import pandas as pd

df = DataFrame.from_csv("movie_info.csv", sep=",")
movie_synopsis = df['synopsis']

#### First step is to delete missing values in the synopsis column
movie_synopsis_nomissing = [r for r in movie_synopsis if pd.notnull(r)]

#### Let's use one of the synopses as an example
sample = movie_synopsis_nomissing[1494]
sample

### Convert to lower case
sample = sample.lower()
sample

### Removing numbers
def import_csv(pth):
    y = DataFrame.from_csv(pth, sep='\t', index_col=0)
    y = y.fillna('')
    return y
from pandas import DataFrame, Series
import pandas as pd
import os
import re
from selenium import webdriver
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
from threading import Thread, BoundedSemaphore
import threading
# from multiprocessing import Pool, cpu_count
# from functools import partial

df_topics = DataFrame(columns=('Competition', 'Topic', 'URL', 'By', 'Views', 'Replies', 'Score'))
df_comp_detailed = DataFrame.from_csv('competitions_detailed.csv', encoding='utf-8')
root = 'https://www.kaggle.com'
topic_index = 0
Topic_Num = 0
#pool_data = BoundedSemaphore(value=8)
pool_html = BoundedSemaphore(value=16)


def get_html(url, file_name):
    global pool_html
    global Topic_Num
    pool_html.acquire()
    #if not os.path.exists(file_name+'.html'):
    #f = open(file_name+'.html','wb')
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup
import time, requests, urllib2, webbrowser, os, csv
import geocoder, math
from pandas import DataFrame

#&markers=color:blue%7Clabel:S%7C40.702147,-74.015794

df = DataFrame.from_csv('analysed_with_zips.csv', sep=',', parse_dates=False)
#df.insert(11,'ZIP','NA')

map_string_list = []
safe_markers = []
unsafe_markers = []
lat_long_list = []
safe_zips = []
unsafe_zips = []
final_string = ''
unsafe_info_window_string = []
safe_info_window_string = []

for index, row in df.iterrows():
    lat = row[8]
    lng = row[9]
    source = row[0]
    label = row[7]
    if lat == 'NA' and lng == 'NA':
        continue
def main():
    #################################################
    ######        LOADING REVIEW DATA         #######
    #################################################
    start_time = time.clock()
    print 'Entering the main thread to start the program'
    data = pd.read_csv('Review_chennai_30000_3_2.csv', sep='|')
    #print data.head()
    revs = data.loc[:, ['rest_review', 'r_name', 'reviewtext', 'date']]
    #print revs.head()
    #print revs.count()
    for i in list(np.where(pd.isnull(revs))):
        revs.drop(revs.index[i], inplace=True)
    #print revs.count()
    #print type(revs), revs

    # clean ASCII codes in the review text (emoticons and other stuff)
    revs['text'] = revs['reviewtext'].apply(lambda x: x.decode('unicode_escape').
                                            encode('ascii', 'ignore').
                                            strip())
    revs['r_name'] = revs['r_name'].apply(lambda x: x.decode('unicode_escape').
                                          encode('ascii', 'ignore').
                                          strip())

    # clean review text from '|' pipe and |<>:/\@#$%^&*()!~?="'' symbols that are
    # used as separators in pandas
    revs['cleanedtext'] = revs['text'].apply(clean_reviewtext_symbols)
    revs['cleanedtext_dots'] = revs['cleanedtext'].apply(clean_reviewtext_dots)
    #print 'Cleaned Text \n', revs['cleanedtext']
    #print 'With dots removed \n', revs['cleanedtext_dots']

    reviews_text = revs['cleanedtext_dots'].str.lower()
    reviews_text = reviews_text.values
    res_names = revs['r_name'].values
    print 'RESTAURANT NAMES = \n', res_names
    print res_names[0], res_names[-1]
    score = 1.0
    created_dates = revs['date'].values
    print 'REVIEWED DATES = \n', created_dates
    print created_dates[0], created_dates[-1]
    #print type(reviews_text), reviews_text

    #################################################
    ###   MODEL LOADING WITHOUT PICKLES   ###  0.5 seconds
    #################################################
    global vect
    global mod
    start_time = time.clock()
    # vect = MyVectorizer(min_df=2,
    #                     ngram_range=(1, 2))
    # # plug _tfidf._idf_diag
    # vect._tfidf._idf_diag = sp.spdiags(idfs,
    #                                    diags=0,
    #                                    m=len(idfs),
    #                                    n=len(idfs))
    # vocabulary = json.load(open('vocabulary.json', mode='rb'))
    # vect.vocabulary_ = vocabulary
    start_time = time.clock()
    with open('Vectorer.pkl', 'rb') as g:
        vect = cPickle.load(g)
    with open('Classifier.pkl', 'rb') as g:
        mod = cPickle.load(g)
    duration = time.clock() - start_time
    print 'Time taken to load the models is ', duration

    #############################################
    ###   MODEL LOADING WITH PICKLES   ###  2.5 seconds
    #############################################
    # timeload = time.clock()
    # with open('Vect_cPickle_Ngrams.pkl', 'rb') as f:
    #     vect = cPickle.load(f)
    # with open('Log_Reg_Model_cPickle_Ngrams.pkl', 'rb') as g:
    #     mod = cPickle.load(g)
    # print 'Time for cPickle loading of models: ', time.clock() - timeload

    #######################################
    ####      DATABASE CREATION       #####
    #######################################
    conn = sqlite3.connect('final_30000_3_2.db')
    print "Opened database successfully"
    # conn.execute('''CREATE TABLE RATINGS
    #                 (ID INT PRIMARY KEY NOT NULL,
    #                  RES_NAME TEXT NOT NULL,
    #                  RES_ID INT NOT NULL,
    #                  DISH_NAME TEXT NOT NULL,
    #                  SENTIMENT CHAR(10) NOT NULL)''')
    # print "Table created successfully"
    conn.execute('''CREATE TABLE SA_REVIEW_SCORE
                    (ID INT PRIMARY KEY NOT NULL,
                     REVIEW_ID INTEGER NOT NULL,
                     RES_NAME TEXT NOT NULL,
                     KEYWORD_ID INTEGER NOT NULL,
                     DISH_NAME TEXT NOT NULL,
                     SCORE REAL NOT NULL,
                     SENTIMENT CHAR(10) NOT NULL,
                     CREATED_DATE DATE NOT NULL)''')
    #print "Table created successfully"
    conn.execute('''CREATE TABLE ASPECTS_SCORE
                    (ID INT PRIMARY KEY NOT NULL,
                     REVIEW_ID INTEGER NOT NULL,
                     RES_NAME TEXT NOT NULL,
                     REVIEW_SENTIMENT TEXT NOT NULL,
                     SERVICE_SENTIMENT TEXT NOT NULL,
                     VALUE_SENTIMENT TEXT NOT NULL,
                     AMBIENCE_SENTIMENT TEXT NOT NULL,
                     FOOD_SENTIMENT TEXT NOT NULL)''')
    print "Table created successfully"

    ####### Used for creating dish_list list in dish_list.py #########
    # dish_df = pd.read_csv('Food Dishes_SA.csv')
    # dish_list = dish_df['Table 1'].values
    # print dish_list
    # dish_list = [x.lower() for x in dish_list]
    # print dish_list
    # dish_list1 = []
    # dish_list1 = '\t'.join(dish_list)

    text2 = """Sushi is amazingly bad. Service is bad. Noodles is awesome. Interiors were badly made. Nigiri is good. Idli is amazing. Aloo gobi is nice. What can i say about Mutton Biriyani? It is bad. The waiters are patient. They are really good."""
    text1 = """Waiters are patient. They are also amazing. Biryani is awesome. I would come any day here."""

    servicelist = ['service', 'waiter', 'welcome', 'friendly', 'staff', 'waitress',
                   'bar tender', 'bartender', 'chef', 'people', 'steward',
                   'stewardess', 'manners']
    valuelist = ['value', 'cheap', 'cost', 'price', 'economical', 'reasonable',
                 'budget', 'pricey', 'steep', 'costly']
    ambiencelist = ['place', 'places', 'environment', 'atmosphere', 'climate',
                    'surroundings', 'look', 'mood', 'view', 'serene', 'decor',
                    'clean', 'pristine', 'neat']
    foodlist = ['meal', 'lunch', 'dinner', 'brunch', 'snacks', 'cuisine', 'entree',
                'starters', 'meals', 'lunches', 'brunches', 'entrees']

    cp = nltk.RegexpParser(grammar)
    dish_counter = 0
    review_index = 0
    nouns_list = []
    ambience_sentiment = '#'
    value_sentiment = '#'
    food_sentiment = '#'
    service_sentiment = '#'
    review_sentiment = '#'

    for review in reviews_text:
        dup_dishes = []
        review_index += 1
        print '########## REVIEW # %d ##############' % (review_index)
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' ####################################'
        print "REVIEW = ", review

        ##### Removing | symbol in the review text #######
        #review = review_cleanup(review, unwanted_elements)
        # review = re.sub('[|]', '', review)
        # print 'Cleaned Review = ', review

        #####################################################
        ######   FULL REVIEW SENTIMENT PREDICTION      ######
        #####################################################
        REVIEWDATA = StringIO("""Review |""" + review)
        df = DataFrame.from_csv(REVIEWDATA, sep="|", parse_dates=False)
        print 'FULL REVIEW SENTIMENT PREDICTION GOES ON .......'
        print df
        review_bow = vect.transform(df['Review'])
        pred_review = mod.predict(review_bow)
        proba_review = mod.predict_proba(review_bow)
        #print review_bow
        print pred_review
        print proba_review
        if str(pred_review[0]) == '1':
            result = 'Positive'
            dict1 = {review: result}
            score = proba_review[0][1]
            print '++++++++++++++++++++'
            print ' The Polarity of the review is ', result
        else:
            result = 'Negative'
            #dict1 = {sent : result}
            score = proba_review[0][0]
            print '--------------------'
            print ' The Polarity of the review is ', result
        review_sentiment = result
        print 'Full review sentiment is ', review_sentiment

        ##################################################
        ######    SENTENCE SENTIMENT PREDICTION     ######
        ##################################################
        sentences = nltk.sent_tokenize(review)
        sent_index = 0
        prp_list_index = []
        for sent in sentences:
            sent_index += 1
            #print 'Sentence No: ', sent_index
            tagged = nltk.pos_tag(nltk.word_tokenize(sent))
            #print tagged
            #nouns = [word for word, pos in tagged if pos in ['NN', 'NNP', 'NNS']]
            #print 'Nouns = ', nouns
            #adjectives = [word for word, pos in tagged if pos in ['JJ']]
            #print 'Adjectives = ', adjectives
            #adverbs = [word for word, pos in tagged if pos in ['RB', 'RBS']]
            #print 'Adverbs = ', adverbs

            ######## COLLECTING SUCCESSIVE NOUNS ########
            parsed_content = cp.parse(tagged)
            dish1 = re.findall(r'NP\s(.*?)/NN\w*', str(parsed_content))
            dish2 = re.findall(r'NP\s(.*?)/NN\w*\s(.*?)/NN', str(parsed_content))
            dish3 = re.findall(r'NP\s(.*?)/NN\w*\s(.*?)/NN\w*\s(.*?)/NN', str(parsed_content))
            dish4 = re.findall(r'NP\s(.*?)/NN\w*\s(.*?)/NN\w*\s(.*?)/NN\w*\s(.*?)/NN', str(parsed_content))
            print parsed_content
            nouns = []
            dlist = []
            if len(dish4) != 0:
                for t in dish4:
                    noun = ' '.join(item for item in t)
                    nouns.append(noun)
                    nouns_list.append(noun)
                #print 'Nouns after 1st iteration ', nouns
                dlist = '\t'.join(nouns)
            if len(dish3) != 0:
                for t in dish3:
                    noun = ' '.join(item for item in t)
                    if noun not in dlist:
                        nouns.append(noun)
                        nouns_list.append(noun)
                #print 'Nouns after 1st iteration ', nouns
                dlist = '\t'.join(nouns)
            if len(dish2) != 0:
                for t in dish2:
                    noun = ' '.join(item for item in t)
                    if noun not in dlist:
                        nouns.append(noun)
                        nouns_list.append(noun)
                #print 'Nouns after 2nd iteration ', nouns
                dlist = '\t'.join(nouns)
            if len(dish1) != 0:
                for t in dish1:
                    if t not in dlist:
                        nouns.append(t)
                        nouns_list.append(t)
            print 'Nouns after last iteration ', nouns

            # prps = [word for word, pos in tagged if pos in ['PRP']]
            # print 'PRPs = ', prps
            ##### Including NamedEntity
            # namedEnt = nltk.ne_chunk(tagged, binary=True)
            # print 'NamedEnt = ', namedEnt
            #namedEnt.draw()
            # for noun in nouns:
            #     print noun, type(noun)

            flagService = False
            flagDish = False
            flagAmbience = False
            flagValue = False
            flagFood = False
            for noun in nouns:
                ans = getKeyFromDictionary(noun, dish_dict)
                #print 'Printing ans = \n', ans
                if ans and len(ans) > 3:
                    flagDish = True
                if noun in servicelist:
                    flagService = True
                if noun in ambiencelist:
                    flagAmbience = True
                if noun in valuelist:
                    flagValue = True
                if noun in foodlist:
                    flagFood = True
                else:
                    pass
            #print 'Flags : ', flagAmbience
            #print 'Flag = ', flag

            if flagDish:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    result = 'Positive'
                    dict = {sent: result}
                    score = probability[0][1]
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    result = 'Negative'
                    dict = {sent: result}
                    score = probability[0][0]
                    #print '--------------------'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('negres.html', result=dict)
                #print 'inserting noun,prediction,res_name,res_id,review_id into sqlite3 db'
                #print 'creating a list & jsonify it as output'
                #print 'Keyword %s is %s' % (noun, result)
                #cur.execute("INSERT INTO Contacts VALUES (?, ?, ?, ?);", (firstname, lastname, phone, email))
                for noun in nouns:
                    ans = getKeyFromDictionary(noun, dish_dict)
                    if ans and len(ans) > 3 and ans not in dup_dishes:
                        dish_counter += 1  # dishes
                        #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
                        conn.execute("INSERT INTO SA_REVIEW_SCORE VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                                     (dish_counter, review_index, res_names[review_index - 1],
                                      dish_counter, ans, score, result,
                                      created_dates[review_index - 1]))
                        print 'Noun = %s & Sentiment = %s ' % (ans, result)
                        dup_dishes.append(ans)
                    else:
                        pass
                        #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
                conn.commit()
                #print "Records created successfully"

            if flagService:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    serviceresult = 'Positive'
                    #print '++++++++++++++++++++'
                    print ' The Polarity of the review is ', serviceresult
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    serviceresult = 'Negative'
                    print ' The Polarity of the review is ', serviceresult
                if serviceresult == 'Positive':
                    service_sentiment = '1'
                elif serviceresult == 'Negative':
                    service_sentiment = '0'
                else:
                    service_sentiment = '#'
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            if flagFood:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    foodresult = 'Positive'
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    foodresult = 'Negative'
                if foodresult == 'Positive':
                    food_sentiment = '1'
                elif foodresult == 'Negative':
                    food_sentiment = '0'
                else:
                    food_sentiment = '#'
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            if flagAmbience:
                print 'Inside Ambience'
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    ambienceresult = 'Positive'
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    ambienceresult = 'Negative'
                if ambienceresult == 'Positive':
                    #print 'Positive'
                    ambience_sentiment = '1'
                    print ambience_sentiment
                elif ambienceresult == 'Negative':
                    #print 'Negative'
                    ambience_sentiment = '0'
                    print ambience_sentiment
                else:
                    ambience_sentiment = '#'
                    print ambience_sentiment
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            if flagValue:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    valueresult = 'Positive'
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    valueresult = 'Negative'
                if valueresult == 'Positive':
                    value_sentiment = '1'
                elif valueresult == 'Negative':
                    value_sentiment = '0'
                else:
                    value_sentiment = '#'
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            #print 'inserting noun,prediction,res_name,res_id,review_id into sqlite3 db'
            #print 'creating a list & jsonify it as output'
            #print 'Keyword %s is %s' % (noun, result)
            #cur.execute("INSERT INTO Contacts VALUES (?, ?, ?, ?);", (firstname, lastname, phone, email))
            if noun in servicelist or noun in ambiencelist or noun in valuelist or noun in foodlist:
                #if noun in ambiencelist:
                dish_counter += 1  # dishes
                #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
                print 'Inside aspects_score printing module'
                print ' '
                print ' '
                print ambience_sentiment, value_sentiment
                conn.execute("INSERT INTO ASPECTS_SCORE VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                             (dish_counter, review_index, res_names[review_index - 1],
                              review_sentiment, service_sentiment, value_sentiment,
                              ambience_sentiment, food_sentiment))
                # print 'Noun = %s & Sentiment = %s ' % (noun, result)
                # dup_dishes.append(noun)
            else:
                pass
                #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
            conn.commit()
            #print "Records created successfully"

    cursor = conn.execute("SELECT * from SA_REVIEW_SCORE")
    print 'Printing Stored values in SA_REVIEW_SCORE table'
    print "REVIEW_ID \t\t RES_NAME \t\t KEYWORD_ID \t\t DISH_NAME \t\t SCORE \t\t SENTIMENT \t\t CREATED_DATE "
    for row in cursor:
        # print "REVIEW_ID = ", row[1]
        # print "RES_NAME = ", row[2]
        # print "KEYWORD_ID = ", row[3]
        # print "DISH_NAME = ", row[4]
        # print "SCORE = %3.2f" % (row[5])
        # print "SENTIMENT = ", row[6]
        # print "CREATED_DATE = ", row[7], "\n"
        print "%d \t\t %s \t\t %d \t\t %s \t\t %3.2f \t\t %s \t\t %s" % (row[1], row[2], row[3], row[4], row[5], row[6], row[7])

    cursor1 = conn.execute("SELECT * from ASPECTS_SCORE")
    print 'Printing Stored values in ASPECTS_SCORE table'
    print "REVIEW_ID \t\t RES_NAME \t\t REVIEW_SENTIMENT \t\t SERVICE_SENTIMENT \t\t VALUE_SENTIMENT \t\t AMBIENCE_SENTIMENT \t\t FOOD_SENTIMENT "
    for row in cursor1:
        print "%d \t\t %s \t\t %s \t\t %s \t\t %s \t\t %s \t\t %s" % (row[1], row[2], row[3], row[4], row[5], row[6], row[7])

    conn.close()
    print 'Program exiting.....'
    print 'Total time taken: ', time.clock() - start_time, ' seconds'
    return 0
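# The StringIO + DataFrame.from_csv dance above builds a one-row frame just to
# feed a single sentence to the vectorizer. A minimal sketch of the same step
# without the CSV round trip, assuming vect and mod are the pickled vectorizer
# and classifier loaded earlier (scikit-learn vectorizers accept any iterable
# of strings):
def predict_sentiment(sent, vect, mod):
    bow = vect.transform([sent])         # vectorize a single sentence
    label = mod.predict(bow)[0]          # predicted class label
    proba = mod.predict_proba(bow)[0]    # [p_negative, p_positive]
    return ('Positive', proba[1]) if str(label) == '1' else ('Negative', proba[0])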
redosage = re.compile(r'\d+')
dosage = [int(redosage.findall(name)[0]) for name in names]

# finds 'unite' (unit)
reunite = re.compile(r'microgrammes|µg|mg|c|g')
unite = [reunite.findall(name)[0] for name in names]

# finds 'forme' (form)
reforme = re.compile(r'comprimé sécable')
forme = [reforme.findall(name)[0] for name in names]

###############################################################################
# part 2
###############################################################################

# load base
base = DataFrame.from_csv(
    '/home/roms/Telecom/P1/Kit big data/Work/MEDICAM 2008-2013-AMELI clean.csv',
    header=0, sep=',')

# drugs delisted from reimbursement in 2013
derembourse = base[(base['Montant remboursé 2012'] != "0") &
                   (base['Montant remboursé 2013'] == "0")]['NOM COURT']
derembourse.to_csv('derembourse2013.csv')

# drugs newly reimbursed in 2013
rembourse = base[(base['Montant remboursé 2012'] == "0") &
                 (base['Montant remboursé 2013'] != "0")]['NOM COURT']
rembourse.to_csv('rembourse2013.csv')
import csv
import bisect
import sklearn
from pandas import DataFrame
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, cohen_kappa_score
from sklearn import tree
from sklearn import svm, datasets, cross_validation
from sklearn import preprocessing
# the imblearn module provides several ways to deal with imbalance in data
from imblearn.over_sampling import SMOTE

# load the test data
df_test_data = DataFrame.from_csv("test_data.csv", index_col=False)
df_test_target = DataFrame.from_csv("test_target.csv", index_col=False)

# print the shape of the dataframes
print df_test_data.shape
print df_test_target.shape[0]


def test():
    # load the classifier
    clf = joblib.load('model/random_clf.pkl')
    # predict on the test data
    pred = clf.predict(df_test_data)
    # printing the evaluation metrics
    # accuracy can't be used as the sole measure of the model, hence metrics
    # like recall, precision and kappa are also measured
codes_hd5_filename = splitext(codes_hd5_filename)[
    0] + '_' + pop_size + splitext(codes_hd5_filename)[1]
idx_hd5_filename = splitext(idx_hd5_filename)[
    0] + '_' + pop_size + splitext(idx_hd5_filename)[1]

query_args = {'dbname': dbname}
if args['psql_host'] is not None:
    query_args['host'] = args['psql_host']
if args['psql_password'] is not None:
    query_args['password'] = args['psql_password']

#############
# Population extraction

data = None
if ((args['extract_pop'] == 0) | (args['extract_pop'] == 1)) & isfile(
        os.path.join(outPath, static_filename)):
    data = DataFrame.from_csv(os.path.join(outPath, static_filename))
    data = sanitize_df(data, static_data_schema)
    """
    data['admission_type'] = data['admission_type'].astype('category')
    data['gender'] = data['gender'].astype('category')
    data['first_careunit'] = data['first_careunit'].astype('category')
    data['ethnicity'] = data['ethnicity'].astype('category')
    data['intime'] = pd.to_datetime(data['intime'])  #, format="%m/%d/%Y"))
    data['outtime'] = pd.to_datetime(data['outtime'])
    data['admittime'] = pd.to_datetime(data['admittime'])
    data['dischtime'] = pd.to_datetime(data['dischtime'])
    data['deathtime'] = pd.to_datetime(data['deathtime'])
    """
elif ((args['extract_pop'] == 1) & (not isfile(os.path.join(outPath, static_filename)))) | (
def cal_fun(bam_path, bed_file):
    '''
    Using an input bed file and a bam file, calculate per-position coverage
    info over the intervals in the bed.
    :param bam_path:
    :param bed_file:
    :return: writes the cov_info file next to the bam file, 1-based coordinates.
    '''
    bed_file = df.from_csv(bed_file, index_col=False, sep='\t')
    try:
        bamfile = pysam.AlignmentFile(bam_path, 'rb')
    except:
        print("bam file needed for calculation doesn't exist")
        raise IOError
    fastafile = pysam.FastaFile(filename=REF_file_path)  # define each file path.
    # define columns.
    result = 'Gene\tChr\tPosition\tReference\tbase\tA\tC\tG\tT\tA_Rate\tC_Rate\tG_Rate\tT_Rate\t1\t2\t3\t4\n'
    for i in range(len(bed_file)):  # iterate over intervals
        Chr = bed_file.iloc[i, 0]
        start = min(int(bed_file.iloc[i, 2]), int(bed_file.iloc[i, 1]))
        end = max(int(bed_file.iloc[i, 2]), int(bed_file.iloc[i, 1]))
        Gene_name = str(bed_file.iloc[i, 3])  # fetch basic info.
        coverage_ACGT = bamfile.count_coverage(
            Chr, start - 1, end, read_callback='nofilter',
            quality_threshold=0)  # from 1-based to 0-based
        # make it a dict in the preset order.
        base_counter = dict(zip(['A', 'C', 'G', 'T'], coverage_ACGT))
        ref_base_str = fastafile.fetch(Chr, start - 1, end)  # needs 0-based start/end.
        if sum(sum(j) for j in coverage_ACGT) != 0:
            # if this interval doesn't have any reads, we ignore it.
            for base_n in range(start, end + 1):
                n_read = int(sum([k[range(start, end + 1).index(base_n)]
                                  for k in coverage_ACGT]))  # total base count
                if n_read == 0:
                    continue  # if this position has no bases, skip it.
                result += Gene_name + '\t' + Chr + '\t' + str(base_n) + '\t' + \
                    ref_base_str[range(start, end + 1).index(base_n)] + '\t' + '\t'
                for base in ['A', 'C', 'G', 'T']:
                    # write the A/C/G/T counts.
                    result += str(base_counter[base][range(start, end + 1).index(base_n)]) + '\t'
                for base in ['A', 'C', 'G', 'T']:
                    # write the rates of A/C/G/T
                    result += str(round(float(base_counter[base][range(
                        start, end + 1).index(base_n)]) / n_read, 4)) + '\t'
                # build a list to sort the A/C/G/T base counts.
                little_rank_list = [(each_one[1][range(start, end + 1).index(base_n)], each_one[0])
                                    for each_one in base_counter.items()]
                # emit the ranked bases as columns and terminate the line.
                result += '\t'.join([ranked_base[1] for ranked_base in
                                     sorted(little_rank_list, reverse=True)]) + '\n'
            if result[-1:] != '\n':
                # in case the last position had no bases and was skipped,
                # make sure the line is terminated.
                result += '\n'
        else:
            pass
    with open(bam_path.partition('.')[0] + '_cov.info', 'w') as f1:
        f1.write(result)
    print('Cal cov info complete')
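# pysam.AlignmentFile.count_coverage returns four parallel arrays (A, C, G, T
# counts), one entry per reference position in the half-open 0-based window
# [start, stop). A minimal sketch of reading total depth at one 1-based
# position; "sample.bam", "chr1" and pos are placeholders:
import pysam

pos = 100  # 1-based position of interest (placeholder)
bam = pysam.AlignmentFile("sample.bam", "rb")
a, c, g, t = bam.count_coverage("chr1", pos - 1, pos, quality_threshold=0)
depth = a[0] + c[0] + g[0] + t[0]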
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame

df = DataFrame.from_csv("output.dat", sep="\t")
values = np.transpose(df.values)

fig, ax = plt.subplots()
origin = "lower"
pcm = ax.imshow(np.abs(values), cmap='bone_r', origin=origin, aspect="auto")
fig.colorbar(pcm, ax=ax)
plt.show()
def _do_test(df, r_dtype=None, c_dtype=None,
             rnlvl=None, cnlvl=None, dupe_col=False):

    kwargs = dict(parse_dates=False)
    if cnlvl:
        if rnlvl is not None:
            kwargs['index_col'] = lrange(rnlvl)
        kwargs['header'] = lrange(cnlvl)
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8',
                      chunksize=chunksize, tupleize_cols=False)
            recons = DataFrame.from_csv(path, tupleize_cols=False, **kwargs)
    else:
        kwargs['header'] = 0
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8', chunksize=chunksize)
            recons = DataFrame.from_csv(path, **kwargs)

    def _to_uni(x):
        if not isinstance(x, compat.text_type):
            return x.decode('utf8')
        return x

    if dupe_col:
        # read_csv disambiguates the columns by
        # labeling them dupe.1, dupe.2, etc'. monkey patch columns
        recons.columns = df.columns
    if rnlvl and not cnlvl:
        delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
        ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
        recons.index = ix
        recons = recons.iloc[:, rnlvl - 1:]

    type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
    if r_dtype:
        if r_dtype == 'u':  # unicode
            r_dtype = 'O'
            recons.index = np.array(lmap(_to_uni, recons.index),
                                    dtype=r_dtype)
            df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
        elif r_dtype == 'dt':  # datetime
            r_dtype = 'O'
            recons.index = np.array(lmap(Timestamp, recons.index),
                                    dtype=r_dtype)
            df.index = np.array(lmap(Timestamp, df.index), dtype=r_dtype)
        elif r_dtype == 'p':
            r_dtype = 'O'
            recons.index = np.array(list(map(Timestamp,
                                             to_datetime(recons.index))),
                                    dtype=r_dtype)
            df.index = np.array(list(map(Timestamp,
                                         df.index.to_timestamp())),
                                dtype=r_dtype)
        else:
            r_dtype = type_map.get(r_dtype)
            recons.index = np.array(recons.index, dtype=r_dtype)
            df.index = np.array(df.index, dtype=r_dtype)
    if c_dtype:
        if c_dtype == 'u':
            c_dtype = 'O'
            recons.columns = np.array(lmap(_to_uni, recons.columns),
                                      dtype=c_dtype)
            df.columns = np.array(lmap(_to_uni, df.columns), dtype=c_dtype)
        elif c_dtype == 'dt':
            c_dtype = 'O'
            recons.columns = np.array(lmap(Timestamp, recons.columns),
                                      dtype=c_dtype)
            df.columns = np.array(lmap(Timestamp, df.columns),
                                  dtype=c_dtype)
        elif c_dtype == 'p':
            c_dtype = 'O'
            recons.columns = np.array(lmap(Timestamp,
                                           to_datetime(recons.columns)),
                                      dtype=c_dtype)
            df.columns = np.array(lmap(Timestamp,
                                       df.columns.to_timestamp()),
                                  dtype=c_dtype)
        else:
            c_dtype = type_map.get(c_dtype)
            recons.columns = np.array(recons.columns, dtype=c_dtype)
            df.columns = np.array(df.columns, dtype=c_dtype)

    assert_frame_equal(df, recons, check_names=False,
                       check_less_precise=True)
def main(isLoadData=1, isCutData=0, PieceLength=500, isLoadFeatures=1,
         isGetFeaturesNaNs=0, isLoadLabels=1, LabelFileName={},
         FeatureMethod='Quantization', LabelBy='PANSS'):
    # -- TODO:
    # -- make sure it works for len(FeatureMethod) > 1; now only uses FeatureTypeList[0]
    os.system('cls')

    ## Construct / load DATA object
    DataPath = resultsPath + '\\LearningData'
    if isLoadData:
        print('loading DATA from ' + DataPath + '...')
        dataObject = pickle.load(open(os.path.join(DataPath, 'DATAraw.pickle'), 'rb'))
        # TODO: change 'raw' to the PieceLength variable and make sure it loads the cut data
    else:
        AllAUs = ['TimeStamps', 'TrackingSuccess', 'au1', 'au2', 'au3', 'au4',
                  'au5', 'au6', 'au7', 'au8', 'au9', 'au10', 'au11', 'au12',
                  'au13', 'au14', 'au15', 'au16', 'au17', 'au18', 'au19',
                  'au20', 'au21', 'au22', 'au23', 'au24', 'au25', 'au26',
                  'au27', 'au28', 'au29', 'au30', 'au31', 'au32', 'au33',
                  'au34', 'au35', 'au36', 'au37', 'au38', 'au39', 'au40',
                  'au41', 'au42', 'au43', 'au44', 'au45', 'au46', 'au47', 'au48']
        GoodTrackableAUs = ['au17', 'au18', 'au19', 'au1', 'au22', 'au25',
                            'au26', 'au27', 'au28', 'au29', 'au2', 'au30',
                            'au31', 'au32', 'au33', 'au34', 'au37', 'au41',
                            'au43', 'au45', 'au47', 'au48', 'au8']
        PartNames = 'Interview'
        isQuantize = True
        isCutData = True
        #print('fs-signal: ' + GoodTrackableAUs)
        print('Part: ' + PartNames)
        #print('isQuantize=' + str(isQuantize))
        isSetDataParams = int(raw_input('reset data params? '))
        if isSetDataParams:
            GoodTrackableAUs = raw_input('set fs-signal (as list): ')
            PartNames = raw_input('set Part name (as str, capital first letter): ')
        dataObject = DataObject(PartNames, VarNames=GoodTrackableAUs)
        dataObject.getQuantize()
        pickle.dump(dataObject, open(os.path.join(resultsPath, 'DATA'), 'wb'))

    if isCutData:
        isQuantize = 1  #raw_input('set isQuantize to: ')
        print('constructing Data Object...')
        print('cutting raw data')
        dataObject.rawDF = dataObject.cutData(dataObject.rawDF, PieceLength)
        print('cutting quantized data')
        dataObject.quantizedDF = dataObject.cutData(dataObject.quantizedDF, PieceLength)
        isSaveData = 1  #int(raw_input('save cut data? '))
        if isSaveData:
            saveName = os.path.join(resultsPath, 'LearningData',
                                    'DATA_' + str(PieceLength))
            #dataObject.rawDF.to_csv(saveName + 'rawDF.csv')
            #dataObject.quantizedDF.to_csv(saveName + 'quantizedDF.csv')
            pickle.dump(dataObject, open(saveName + '.pickle', 'wb'))

    ## Calc / load FEATURES for learning
    FeaturesPath = resultsPath + '\\LearningFeatures\\' + FeatureMethod + \
        '_Features_' + str(PieceLength)
    Features = FeatureObject(dataObject, FeaturesPath, PieceLength)
    if isLoadFeatures:
        print('loading FEATURES from ' + FeaturesPath + '...\n')
        Features.FeaturesDF = read_csv(FeaturesPath + 'DF.csv',
                                       index_col=[0, 1],
                                       skipinitialspace=True,
                                       header=[0, 1])
        Features.method = FeatureMethod
    else:
        if not FeatureMethod:
            FeatureMethod = raw_input("Enter Feature Type ('Quantization', 'Moments') as list: ")
        print("Calculating subjects' " + FeatureMethod + " features ...")
        Features.getFeatures(FeatureMethod)
        if isGetFeaturesNaNs:
            Features.FeaturesDF = featuresUtils.getMissingFeatures(Features)
        Features.FeaturesDF.to_csv(Features.FeaturesPath + 'DF.csv')

    ## Set / load LABELS for learning
    LabelsPath = resultsPath + '\\LearningLabels\\' + LabelBy + '_Labels'  # for loading / saving
    LabelsPath2 = LabelsPath + '2'
    if isLoadLabels:
        print('loading LABELS from ' + LabelsPath + '...\n')
        Labels = pickle.load(open(LabelsPath + ".pickle", 'rb'))
        Labels2 = Labels  #pickle.load(open(LabelsPath2 + ".pickle", 'rb'))
        # TODO: change this when there is a second labeled data set (from Michael)
    else:
        Labels = LabelObject(SubjectsDetailsDF, LabelsPath)
        Labels.getLabels(LabelBy)
        SubjectsDetailsDF2 = DF.from_csv(
            'C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\SubjectsDetailsDF2-fill with data from michael.csv')
        Labels2 = LabelObject(SubjectsDetailsDF2, LabelsPath2)
        Labels2.getLabels(LabelBy)
    #Labels.permLabels()  # TODO: move this to "not isLoad" or somewhere else.

    ## Get cross-validation learning results:
    # loop over feature number
    FeatureRange = [10]  #range(1, 6)  #range(1, 50, 5)  #[6]
    # init
    isBoolLabel = Labels.isBoolLabel
    FeatureComparession = {}
    SelectedFeaturesComparession = {}
    newDF = lambda: DF(columns=FeatureRange, index=Labels.names)
    if isBoolLabel:
        All_specificity = newDF()
        All_sensitivity = newDF()
        All_precision = newDF()
        All_accuracy = newDF()
        All_f1 = newDF()
        All_ss_mean = newDF()
    else:
        All_trainR = newDF()
        All_trainPval = newDF()
        All_trainErr = newDF()
        All_testR = newDF()
        All_testPval = newDF()
        All_testErr = newDF()
        All_testErrStd = newDF()
        All_LabelRange = newDF()

    ModelList = ['ridge']  #['regression', 'ridge', 'lasso']
    FeatureSelectionList = ['PCA']  #, 'KernelPCA', 'SparsePCA', 'ICA']
    for m in ModelList:
        print('************************************ Model = ' + m +
              '************************************')
        for fs in FeatureSelectionList:
            print('***************************** FeatureSelection = ' + fs +
                  '******************************')
            for f in FeatureRange:
                print('Num Of Features = ' + str(f))
                s = LearnObject(Features, Labels, Labels2)
                s.run(Model=m, n_features=f, isSavePickle=0, isSaveCsv=1,
                      isSaveFig=1, isPerm=0, isBetweenSubjects=True,
                      FeatureSelection=fs)
                LabelNameList = s.ResultsDF.columns  # TODO - CHANGE THIS!
                for label in LabelNameList:
                    print(label)
                    if f == FeatureRange[0]:
                        FeatureComparession[label] = DF(columns=FeatureRange,
                                                        index=s.ResultsDF.index)
                        SelectedFeaturesComparession[label] = DF(columns=FeatureRange,
                                                                 index=s.BestFeatures.index)
                    FeatureComparession[label][f] = s.ResultsDF[label]
                    SelectedFeaturesComparession[label][f] = s.BestFeatures[label]
                    r = s.ResultsDF[label]
                    if isBoolLabel:
                        All_specificity[f].loc[label] = r['specificity']
                        All_sensitivity[f].loc[label] = r['sensitivity']
                        All_precision[f].loc[label] = r['precision']
                        All_accuracy[f].loc[label] = r['accuracy']
                        All_f1[f].loc[label] = r['f1']
                        All_ss_mean[f].loc[label] = r['ss_mean']
                    else:
                        All_trainR[f].loc[label] = r['trainR^2']
                        All_trainPval[f].loc[label] = r['trainPval']
                        All_trainErr[f].loc[label] = r['trainError']
                        All_testR[f].loc[label] = r['testR^2']
                        All_testPval[f].loc[label] = r['testPval']
                        All_testErr[f].loc[label] = r['testError']
                        All_testErrStd[f].loc[label] = r['testErrorStd']
                        All_LabelRange[f].loc[label] = r['LabelRange']

    for label in LabelNameList:
        saveName = s.Learningdetails['saveDir'] + '\\' + label + '_ResultsSummary.csv'
        if os.path.exists(saveName):
            isSave = raw_input('the file ' + saveName +
                               ' already exists, \noverwrite existing file? ')
        else:
            isSave = 1
        if isSave:
            resultsSum = concat([
                DF(index=['----------- Learning results -----------']),
                FeatureComparession[label],
                DF(index=['-------Selected Features Analysis-------']),
                SelectedFeaturesComparession[label],
                DF(index=['----------- Learning details -----------']),
                DF.from_dict(s.Learningdetails, orient='index')
            ])
            if s.isDecompose:
                resultsSum = concat([resultsSum, s.LabelComponents[label]])
            resultsSum.to_csv(saveName)

    if isBoolLabel:
        ResultsSummary = concat([
            DF(index=['------specificity vs. Number Of Features-------']),
            All_specificity,
            DF(index=['------sensitivity vs. Number Of Features-------']),
            All_sensitivity,
            DF(index=['------precision vs. Number Of Features-------']),
            All_precision,
            DF(index=['------accuracy vs. Number Of Features-------']),
            All_accuracy,
            DF(index=['------f1 vs. Number Of Features-------']),
            All_f1,
            DF(index=['------sensitivity-specificity mean vs. Number Of Features-------']),
            All_ss_mean
        ])
        ResultsSummary.to_csv(s.Learningdetails['saveDir'] + '\\ResultsSummary_bool.csv')
    else:
        ResultsSummary = concat([
            DF(index=['------train R^2 vs. Number Of Features-------']),
            All_trainR.dropna(),
            DF(index=['------train Pval vs. Number Of Features-------']),
            All_trainPval.dropna(),
            DF(index=['------train Error vs. Number Of Features-------']),
            All_trainErr.dropna(),
            DF(index=['------test R^2 vs. Number Of Features-------']),
            All_testR.dropna(),
            DF(index=['------testPval vs. Number Of Features-------']),
            All_testPval.dropna(),
            DF(index=['------test Error vs. Number Of Features-------']),
            All_testErr.dropna(),
            DF(index=['------test Error STD vs. Number Of Features-------']),
            All_testErrStd.dropna(),
            DF(index=['------Label Range vs. Number Of Features-------']),
            All_LabelRange.dropna()
        ])
        ResultsSummary.to_csv(s.Learningdetails['saveDir'] + '\\ResultsSummary_regression.csv')

    # permutation test:
    """
    # init
import pandas as pd
from pandas import DataFrame

df_tennis = DataFrame.from_csv('tennis.csv')
print("\n Given Play Tennis Data Set:\n\n", df_tennis)
        y = line.split(',')
        y.pop()  # pop off \r\n
        if not '' in y:
            formatted.writelines(line)
        else:
            continue
        i = i + 1
    except:
        break

fpr.close()
formatted.close()
# NOW FORMATTED AS CSV

# Extract csv into DataFrame
DF = df.from_csv(new, sep=',', index_col=None)

# read CSV and format
#fpr = open(filename, 'r')
#fwp = open(outname, 'w')
#Fin = csv.reader(fp)
#dex = ['Date', 'Time']
#DF.set_index(dex)

length = len(DF.index) / 3
DF3 = df(index=range(length), columns=out_columns)
for i in range(length):
    j = 3 * i
""" Created on Tue Sep 24 11:34:04 2019 @author: Soumaya """ import pandas as pd from pandas import DataFrame df_irisbd = DataFrame.from_csv(r"iris.data", header=None, index_col=None) print(df_irisbd) X = df_irisbd.iloc[:, :-1].values y = df_irisbd.iloc[:, 4].values from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=100) print(y_test) from sklearn.neighbors import KNeighborsClassifier model = KNeighborsClassifier(n_neighbors=3) # Train the model using the training sets model.fit(X_train, y_train) predicted = model.predict(X_test) # 0:Overcast, 2:Mild from sklearn.metrics import classification_report, confusion_matrix print(confusion_matrix(y_test, predicted)) print(classification_report(y_test, predicted))
# coding: utf-8

import numpy as np
import pandas as pd
from pandas import DataFrame as df
from cylp.cy import CyClpSimplex
from cylp.py.modeling.CyLPModel import CyLPArray, CyLPModel
from Single_Year_Stage_II import Single_Year_Stage_II

# Import PUF, and results from Stage I
puf = pd.read_csv("/Users/Amy/Documents/puf.csv")
Stage_I_factors = df.from_csv("Stage_I_factors.csv", index_col=0)
Stage_II_targets = df.from_csv("Stage_II_targets.csv", index_col=0)

# all the final weights will be saved in z
length = len(puf.s006)
z = np.empty([length, 17])
z[:, 0] = puf.s006 / 100

# run the LP solver for each year, with tolerance given in the tol argument
z[:, 1] = Single_Year_Stage_II(puf, Stage_I_factors, Stage_II_targets,
                               year='2009', tol=0.24)
z[:, 2] = Single_Year_Stage_II(puf, Stage_I_factors, Stage_II_targets,
                               year='2010',
import json
import requests
import pandas as pd
from pandas import DataFrame

# Search using column names in LOV
# save results to files
# the code needs to be run for each dataframe
df = DataFrame.from_csv('geneData\\genes.tsv', sep='\t')  # pharmgkb
# df = DataFrame.from_csv('geneData\\hgnc_complete_subset.tsv', sep='\t')  # hgnc
# df = DataFrame.from_csv('geneData\\CTD_chem_gene_ixns.tsv', sep='\t')  # ctd
# df = pd.read_csv('geneData\\genage_human.csv')

cols = df.columns.values  # (saved_column)
# dfres = pd.DataFrame(columns=['column_name', 'results', 'no_results', 'vocab_list'])
vocab_list = []
results_list = []
list_dict = []

for col in cols:
    # if "_" in col:
    #     col = col.replace("_", " ")
    parameters = {"q": col, "tag": "Biology", "type": "property"}
    # print(col)
    # Make a GET request with the parameters.
    response = requests.get(
        "https://lov.linkeddata.es/dataset/lov/api/v2/term/search?",
        params=parameters)
    parsed = json.loads(response.content)
    # print(response.content)
    try:
def read_comment_from_file(path='../data/test'):
    # forward slashes avoid the broken backslash escapes in the original default
    comments_df = DataFrame.from_csv(path, header=0, sep='\t')
    comments_df.reset_index(inplace=True)
    return comments_df
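# Example usage (a sketch; assumes a tab-separated comment file exists at
# the default location):
comments = read_comment_from_file()
print(comments.head())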
from pandas import DataFrame
import math

dfH = DataFrame.from_csv('geneData\\hgnccom.csv', index_col=None)  # hgnc
dfg = DataFrame.from_csv('geneData\\genes.tsv', sep='\t', index_col=None)  # pharmgkb

# For the first approach, the numbers from the unique sum need to be eliminated.

############################## location columns from the HGNC file
d1_l = dfH['location'].tolist()
d2_l = dfH['location_sortable'].tolist()
d1_lc = []
d2_lc = []
for i in d1_l:
    if i not in d1_lc:
        d1_lc.append(i)
for j in d2_l:
    if j not in d2_lc:
        d2_lc.append(j)
uniq = []
for x in d1_lc:
    if x not in uniq:
        uniq.append(x)
for y in d2_lc:
    if y not in uniq:
        uniq.append(y)
# print(len(uniq))
# print(len(d1_lc))
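# A more compact, order-preserving equivalent of the dedup loops above
# (dict.fromkeys keeps first-seen order; assumes the values are hashable,
# which holds for the string/NaN location labels here):
d1_lc = list(dict.fromkeys(d1_l))
d2_lc = list(dict.fromkeys(d2_l))
uniq = list(dict.fromkeys(d1_lc + d2_lc))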
import re

import pandas as pd
from pandas import DataFrame
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models, utils

# Fields in data set
# "id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"
pd.options.display.float_format = '{:,.0f}'.format
regex = re.compile("[^a-zA-Z']")
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

Trainfile = "C:\\Kaggle\\train.txt"
df = DataFrame.from_csv(Trainfile, sep='\t', header=0, index_col=None)
comments = df.iloc[:, :8]
# print(df.info())


def perform_lsi(corpus, num_topic, dictionary):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topic)
    print(lsi.print_topics(num_topics=num_topic, num_words=20))


def perform_lda(corpus, num_topic, dictionary):
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic)
    print(lda.print_topics(num_topics=num_topic, num_words=20))
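# A minimal sketch of building the inputs these helpers expect, using
# standard gensim API; 'tokenize' is a hypothetical cleaning step wired
# from the regex/stopwords/lemmatizer defined above.
def tokenize(text):
    words = regex.sub(' ', text).lower().split()
    return [lemmatizer.lemmatize(w) for w in words if w not in stopwords]

texts = [tokenize(t) for t in comments['comment_text'].astype(str)]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
perform_lsi(corpus, 10, dictionary)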
def hapmap3(data_set='hapmap3'):
    try:
        from pandas import read_pickle, DataFrame
        from sys import stdout
        import bz2
    except ImportError:
        # the original "raise i, msg" form is invalid for instance exceptions
        raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
    if not data_available(data_set):
        download_data(data_set)
    dirpath = os.path.join(data_path, 'hapmap3')
    hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
    preprocessed_data_paths = [
        os.path.join(dirpath, hapmap_file_name + file_name)
        for file_name in ['.snps.pickle', '.info.pickle', '.nan.pickle']
    ]
    if not all(map(os.path.exists, preprocessed_data_paths)):
        if not overide_manual_authorize and prompt_user(
                "Preprocessing requires 17GB "
                "of memory and can take a long time, continue? [Y/n]\n"):
            print "Preprocessing required for further usage."
            return
        status = "Preprocessing data, please be patient..."
        print status

        def write_status(message, progress, status):
            stdout.write(" " * len(status))
            stdout.write("\r")
            stdout.flush()
            status = r"[{perc: <{ll}}] {message: <13s}".format(
                message=message, ll=20, perc="=" * int(20. * progress / 100.))
            stdout.write(status)
            stdout.flush()
            return status

        unpacked_files = [
            os.path.join(dirpath, hapmap_file_name + ending)
            for ending in ['.ped', '.map']
        ]
        if not all(map(os.path.exists, unpacked_files)):
            status = write_status('unpacking...', 0, '')
            curr = 0
            for newfilepath in unpacked_files:
                if not os.path.exists(newfilepath):
                    filepath = newfilepath + '.bz2'
                    file_size = os.path.getsize(filepath)
                    with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
                        decomp = bz2.BZ2Decompressor()
                        file_processed = 0
                        buffsize = 100 * 1024
                        for data in iter(lambda: f.read(buffsize), b''):
                            new_file.write(decomp.decompress(data))
                            file_processed += len(data)
                            write_status('unpacking...',
                                         curr + 12. * file_processed / file_size,
                                         status)
                curr += 12
                status = write_status('unpacking...', curr, status)
        status = write_status('reading .ped...', 25, status)
        # Preprocess data: read the unpacked files from dirpath (the bare
        # filenames in the original only worked from inside that directory).
        snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
        status = write_status('reading .map...', 33, status)
        mapnp = np.loadtxt(unpacked_files[1], dtype=str)
        status = write_status('reading relationships.txt...', 42, status)
        # and metainfo:
        infodf = DataFrame.from_csv(
            os.path.join(dirpath, 'relationships_w_pops_121708.txt'),
            header=0, sep='\t')
        infodf.set_index('IID', inplace=True)
        status = write_status('filtering nan...', 45, status)
        snpstr = snpstrnp[:, 6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
        inan = snpstr[:, :, 0] == '0'
        status = write_status('filtering reference alleles...', 55, status)
        ref = np.array(
            map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0, 1)[:, :, :]))
        status = write_status('encoding snps...', 70, status)
        # Encode the information for each gene in {-1,0,1}:
        status = write_status('encoding snps...', 73, status)
        snps = (snpstr == ref[None, :, :])
        status = write_status('encoding snps...', 76, status)
        snps = (snps * np.array([1, -1])[None, None, :])
        status = write_status('encoding snps...', 78, status)
        snps = snps.sum(-1)
        status = write_status('encoding snps', 81, status)
        # int8, so the nan marker -128 fits ('S1' in the original would have
        # stored truncated strings instead of numbers)
        snps = snps.astype('i1')
        status = write_status('marking nan values...', 88, status)
        # put in nan values (masked as -128):
        snps[inan] = -128
        status = write_status('setting up meta...', 94, status)
        # get meta information:
        metaheader = np.r_[[
            'family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype'
        ]]
        metadf = DataFrame(columns=metaheader, data=snpstrnp[:, :6])
        metadf.set_index('iid', inplace=True)
        metadf = metadf.join(infodf.population)
        metadf.to_pickle(preprocessed_data_paths[1])
        # put everything together:
        status = write_status('setting up snps...', 96, status)
        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:, 1])
        snpsdf.to_pickle(preprocessed_data_paths[0])
        status = write_status('setting up snps...', 98, status)
        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:, 1])
        inandf.to_pickle(preprocessed_data_paths[2])
        status = write_status('done :)', 100, status)
        print ''
    else:
        print "loading snps..."
        snpsdf = read_pickle(preprocessed_data_paths[0])
        print "loading metainfo..."
        metadf = read_pickle(preprocessed_data_paths[1])
        print "loading nan entries..."
        inandf = read_pickle(preprocessed_data_paths[2])
    snps = snpsdf.values
    populations = metadf.population.values.astype('S3')
    hapmap = dict(name=data_set,
                  description='The HapMap phase three SNP dataset - '
                  '1184 samples out of 11 populations. inan is a '
                  'boolean array, containing whether or not the '
                  'given entry is nan (nans are masked as '
                  '-128 in snps).',
                  snpsdf=snpsdf,
                  metadf=metadf,
                  snps=snps,
                  inan=inandf.values,
                  inandf=inandf,
                  populations=populations)
    return hapmap
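# Example usage (a sketch): the returned dict exposes the encoded SNP matrix
# plus metadata, per the keys assembled above.
data = hapmap3()
print data['snps'].shape       # samples x SNPs; values in {-1, 0, 1}, nan masked as -128
print data['populations'][:5]  # population code per sample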
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 26 14:30:20 2019

@author: Student
"""
import pandas as pd
from pandas import DataFrame

df_irisbd = DataFrame.from_csv(r"C:\deepa\iris.data", header=None, index_col=None)
print(df_irisbd)

X = df_irisbd.iloc[:, :-1].values
y = df_irisbd.iloc[:, 4].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=100)
print(y_test)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
# Train the model using the training sets
model.fit(X_train, y_train)
predicted = model.predict(X_test)  # predicted iris class labels

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))
popID = []  # cases missing from the clinical data, to be dropped below
for key in mergeDataID:
    if key in clinicalCases:
        mergeDataID[key].update(clinicalCases[key])
    else:
        popID.append(key)
for i in popID:
    mergeDataID.pop(i, None)

test = list(mergeDataID["14313474-376f-4606-9ed9-25ed2acff411"].values())
test2 = list(mergeDataID["14313474-376f-4606-9ed9-25ed2acff411"].keys())

# Add mutations and genes
from pandas import DataFrame
df = DataFrame.from_csv("lungData.tsv", sep="\t")
dataDict = df.to_dict()
temp = dataDict["Mutations"]
# int(''.join(v.split(','))) strips thousands separators, e.g. "1,234" -> 1234
mutDict = {
    k: {
        "simple_somatic_mutations": int(''.join(v.split(',')))
    }
    for k, v in temp.items()
}
temp = dataDict["Genes"]
genesDict = {
    k: {
        "genes_with_simple_somatic_mutations": int(''.join(v.split(',')))
    }
    for k, v in temp.items()
}
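# A hedged sketch of the likely next step (an assumption -- the script is
# truncated above): folding the per-case counts back into mergeDataID the
# same way clinicalCases was merged in.
for counts in (mutDict, genesDict):
    for k, v in counts.items():
        if k in mergeDataID:
            mergeDataID[k].update(v)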
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup
import time, requests, urllib2, webbrowser, os, csv
import geocoder, math
from pandas import DataFrame

# &markers=color:blue%7Clabel:S%7C40.702147,-74.015794
df = DataFrame.from_csv('WEST BENGAL_data.csv', sep=',', parse_dates=False)
df.insert(11, 'ZIP', 'NA')
map_string_list = []
safe_markers = []
unsafe_markers = []
lat_long_list = []
safe_zips = []
unsafe_zips = []
final_string = ''
unsafe_info_window_string = []
safe_info_window_string = []
for index, row in df.iterrows():
    lat = row[8]
    lng = row[9]
    source = row[0]
    label = row[7]
    if lat == 'NA' or lng == 'NA':  # skip rows missing either coordinate
        continue
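# A sketch of turning collected coordinates into a Static Maps markers
# parameter (format taken from the commented example above; the helper
# name and wiring are illustrative only):
def marker_param(color, label, points):
    # points: iterable of (lat, lng) tuples
    coords = '%7C'.join('{},{}'.format(lat, lng) for lat, lng in points)
    return '&markers=color:{}%7Clabel:{}%7C{}'.format(color, label, coords)

# e.g. final_string += marker_param('blue', 'S', lat_long_list)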