def test_pandas_rolling_expanding(): """Test pandas.(Series|DataFrame).(rolling|expanding)""" try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True) series = pd.Series(randint(0, 50, (123,))) res1 = series.rolling(10).progress_apply(lambda x: 1, raw=True) res2 = series.rolling(10).apply(lambda x: 1, raw=True) assert res1.equals(res2) res3 = series.expanding(10).progress_apply(lambda x: 2, raw=True) res4 = series.expanding(10).apply(lambda x: 2, raw=True) assert res3.equals(res4) expects = ['114it'] # 123-10+1 for exres in expects: our_file.seek(0) if our_file.getvalue().count(exres) < 2: our_file.seek(0) raise AssertionError( "\nExpected:\n{0}\nIn:\n{1}\n".format( exres + " at least twice.", our_file.read()))
def test_pandas_series(): """Test pandas.Series.progress_apply and .progress_map""" try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True) series = pd.Series(randint(0, 50, (123,))) res1 = series.progress_apply(lambda x: x + 10) res2 = series.apply(lambda x: x + 10) assert res1.equals(res2) res3 = series.progress_map(lambda x: x + 10) res4 = series.map(lambda x: x + 10) assert res3.equals(res4) expects = ['100%', '123/123'] for exres in expects: our_file.seek(0) if our_file.getvalue().count(exres) < 2: our_file.seek(0) raise AssertionError( "\nExpected:\n{0}\nIn:\n{1}\n".format( exres + " at least twice.", our_file.read()))
def test_pandas_groupby_apply(): """ Test pandas.DataFrame.groupby(...).progress_apply """ try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=False, ascii=True) df = pd.DataFrame(randint(0, 50, (500, 3))) df.groupby(0).progress_apply(lambda x: None) dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc')) dfs.groupby(['a']).progress_apply(lambda x: None) our_file.seek(0) # don't expect final output since no `leave` and # high dynamic `miniters` nexres = '100%|##########|' if nexres in our_file.read(): our_file.seek(0) raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format( nexres, our_file.read()))
def test_pandas_groupby_apply(): """Test pandas.DataFrame.groupby(...).progress_apply""" try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=False, ascii=True) df = pd.DataFrame(randint(0, 50, (500, 3))) df.groupby(0).progress_apply(lambda x: None) dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc')) dfs.groupby(['a']).progress_apply(lambda x: None) our_file.seek(0) # don't expect final output since no `leave` and # high dynamic `miniters` nexres = '100%|##########|' if nexres in our_file.read(): our_file.seek(0) raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format( nexres, our_file.read())) with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True) dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc')) dfs.loc[0] = [2, 1, 1] dfs['d'] = 100 expects = ['500/500', '1/1', '4/4', '2/2'] dfs.groupby(dfs.index).progress_apply(lambda x: None) dfs.groupby('d').progress_apply(lambda x: None) dfs.groupby(dfs.columns, axis=1).progress_apply(lambda x: None) dfs.groupby([2, 2, 1, 1], axis=1).progress_apply(lambda x: None) our_file.seek(0) if our_file.read().count('100%') < 4: our_file.seek(0) raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format( '100% at least four times', our_file.read())) for exres in expects: our_file.seek(0) if our_file.getvalue().count(exres) < 1: our_file.seek(0) raise AssertionError( "\nExpected:\n{0}\nIn:\n {1}\n".format( exres + " at least once.", our_file.read()))
def test_pandas_setup(): """Test tqdm.pandas()""" try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True, total=123) series = pd.Series(randint(0, 50, (100,))) series.progress_apply(lambda x: x + 10) res = our_file.getvalue() assert '100/123' in res
def test_pandas_map(): """ Test pandas.Series.progress_map """ try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True) dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list("abc")) dfs.a.progress_map(lambda x: None) if our_file.getvalue().count("100%") < 1: raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format("100% at least once", our_file.getvalue()))
def map_translate(x): print("Beginning to translate description") tqdm.pandas() x['en_desc'] = x['description'].progress_map(translate) print("Done translating description") print("Beginning to translate title") x['en_title'] = x['title'].progress_map(translate) print("Done translating") print("Beginning to translate region") x['en_region'] = x['region'].progress_map(translate) print("Done translating") print("Beginning to translate city") x['en_city'] = x['city'].progress_map(translate) print("Done translating") print("Beginning to translate category_name") x['en_category_name'] = x['category_name'].progress_map(translate) print("Done translating") return x
def test_pandas_leave(): """ Test pandas with `leave=True` """ try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: df = pd.DataFrame(randint(0, 100, (1000, 6))) tqdm.pandas(file=our_file, leave=True, ascii=True) df.groupby(0).progress_apply(lambda x: None) our_file.seek(0) exres = "100%|##########| 101/101" if exres not in our_file.read(): our_file.seek(0) raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(exres, our_file.read()))
def __debug_angle_error_per_number_of_bins_snr_and_samples(): from tqdm import tqdm import pandas as pd import numpy as np tqdm.pandas() # a = [[10, 100, 1000, 10000], [100, 300], [100, 1000, 10000], [None]] # a = [[100], [100], [10000], [0.1, 0.01]] # inx = pd.MultiIndex.from_product(a, names=['samples', 'hist_bins', 'snr', 'quant_size']) a = dict(samples=[10, 100, 1000, 10000], hist_bins=[100, 300], snr=[100, 1000, 10000], quant_size=[0]) a = dict(samples=[100], hist_bins=[100], snr=[10000], quant_size=[0.2, 0.1, 0.01, 0.0001]) inx = pd.MultiIndex.from_product(a.values(), names=a.keys()) df = pd.DataFrame(index=inx).reset_index(drop=False).sample(frac=1) df = pd.concat([df] * 10000, ignore_index=True) df = df.join(df.progress_apply(lambda row: compare_sinogram_and_eigen_vector(**row.to_dict()), axis=1).apply(pd.Series)) df.to_csv('angle_error_by_sinogram.csv', header=None, mode='a') # df.to_csv('angle_error_by_sinogram.csv', mode='w') print(df.head())
def main(): # open df_train = pd.read_csv('../middle/train_lemma_stem.csv') df_test = pd.read_csv('../middle/test_lemma_stem.csv') # drop None df_train = df_train.replace(np.nan, ' ', regex=True) df_test = df_test.replace(np.nan, ' ', regex=True) # progress bars initialize tqdm.pandas(desc="my bar!") loop_tuple1 = [('punct_re1', 'question1'), ('punct_re2', 'question2')] loop_tuple2 = [('stem_q1', 'punct_re1'), ('stem_q2', 'punct_re2')] loop_tuple3 = [('lemmas_q1', 'punct_re1'), ('lemmas_q2', 'punct_re2')] # implement functions punct_re, spell_lemma, spell_stem to df_train and df_test dataframes df_train, df_test = implement_fun(df_train, df_test, punct_re, loop_tuple1) df_train, df_test = implement_fun(df_train, df_test, spell_lemma, loop_tuple2) df_train, df_test = implement_fun(df_train, df_test, spell_stem, loop_tuple3)
def test_pandas_data_frame(): """Test pandas.DataFrame.progress_apply and .progress_applymap""" try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True) df = pd.DataFrame(randint(0, 50, (100, 200))) def task_func(x): return x + 1 # applymap res1 = df.progress_applymap(task_func) res2 = df.applymap(task_func) assert res1.equals(res2) # apply for axis in [0, 1, 'index', 'columns']: res3 = df.progress_apply(task_func, axis=axis) res4 = df.apply(task_func, axis=axis) assert res3.equals(res4) our_file.seek(0) if our_file.read().count('100%') < 3: our_file.seek(0) raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format( '100% at least three times', our_file.read())) # apply_map, apply axis=0, apply axis=1 expects = ['20000/20000', '200/200', '100/100'] for exres in expects: our_file.seek(0) if our_file.getvalue().count(exres) < 1: our_file.seek(0) raise AssertionError( "\nExpected:\n{0}\nIn:\n {1}\n".format( exres + " at least once.", our_file.read()))
def test_pandas_apply(): """ Test pandas.DataFrame[.series].progress_apply """ try: from numpy.random import randint import pandas as pd except ImportError: raise SkipTest with closing(StringIO()) as our_file: tqdm.pandas(file=our_file, leave=True, ascii=True) df = pd.DataFrame(randint(0, 50, (500, 3))) df.progress_apply(lambda x: None) dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc')) dfs.a.progress_apply(lambda x: None) our_file.seek(0) if our_file.read().count('100%') < 2: our_file.seek(0) raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format( '100% at least twice', our_file.read()))
def postprocess(self, fps, sampling_rate=1, use_kalman=False): """ This function should be called after loading the data by loader It performs the following steps: -: check fps value, should be set and bigger than 0 -: check critical columns should exist in the table -: update data types -: fill 'groumates' if they are not set -: checks if velocity do not exist, compute it for each agent -: compute bounding box of trajectories :param fps: video framerate :param sampling_rate: if bigger than one, the data needs downsampling, otherwise needs interpolation :param use_kalman: for smoothing agent velocities :return: None """ # check for critical_column in self.critical_columns: if critical_column not in self.data: raise ValueError( "Error! some critical columns are missing from trajectory dataset!" ) # modify data types self.data["frame_id"] = self.data["frame_id"].astype(int) if str(self.data["agent_id"].iloc[0]).replace('.', '', 1).isdigit(): self.data["agent_id"] = self.data["agent_id"].astype(int) self.data["pos_x"] = self.data["pos_x"].astype(float) self.data["pos_y"] = self.data["pos_y"].astype(float) self.data["label"] = self.data["label"].str.lower( ) # search with lower-case labels # fill scene_id if "scene_id" not in self.data: self.data["scene_id"] = 0 self.fps = fps # fill timestamps based on frame_id and video_fps if "timestamp" not in self.data: self.data["timestamp"] = self.data["frame_id"] / fps # fill groupmates agent_ids = pd.unique(self.data["agent_id"]) for agent_id in agent_ids: if agent_id not in self.groupmates: self.groupmates[agent_id] = [] # down/up sampling frames if sampling_rate >= 2: # FixMe: down-sampling sampling_rate = int(sampling_rate) self.data = self.data.loc[(self.data["frame_id"] % sampling_rate) == 0] self.data = self.data.reset_index() elif sampling_rate < (1 - 1E-2): # TODO: interpolation pass else: pass # remove the trajectories shorter than 2 frames data_grouped = self.data.groupby(["scene_id", "agent_id"]) single_length_inds = data_grouped.head(1).index[ data_grouped.size() < 2] self.data = self.data.drop(single_length_inds) # fill velocities if "vel_x" not in self.data: data_grouped = self.data.groupby(["scene_id", "agent_id"]) dt = data_grouped["timestamp"].diff() if (dt > 2).sum(): print('Warning! too big dt in [%s]' % self.title) self.data["vel_x"] = (data_grouped["pos_x"].diff() / dt).astype(float) self.data["vel_y"] = (data_grouped["pos_y"].diff() / dt).astype(float) nan_inds = np.array(np.nonzero(dt.isnull().to_numpy())).reshape(-1) self.data["vel_x"].iloc[nan_inds] = self.data["vel_x"].iloc[ nan_inds + 1].to_numpy() self.data["vel_y"].iloc[nan_inds] = self.data["vel_y"].iloc[ nan_inds + 1].to_numpy() # ============================================ if use_kalman: def smooth(group): if len(group) < 2: return group dt = group["timestamp"].diff().iloc[1] kf = KalmanModel(dt, n_dim=2, n_iter=7) smoothed_pos, smoothed_vel = kf.smooth( group[["pos_x", "pos_y"]].to_numpy()) group["pos_x"] = smoothed_pos[:, 0] group["pos_y"] = smoothed_pos[:, 1] group["vel_x"] = smoothed_vel[:, 0] group["vel_y"] = smoothed_vel[:, 1] return group tqdm.pandas(desc="Smoothing trajectories (%s)" % self.title) # print('Smoothing trajectories ...') data_grouped = self.data.groupby(["scene_id", "agent_id"]) self.data = data_grouped.progress_apply(smooth)
# importing the libraries import pandas as pd import numpy as np from matplotlib import pyplot as plt import seaborn as sns from tqdm import tqdm import pandas_profiling %matplotlib inline tqdm.pandas(desc="Operation Progress") test_file = pd.read_csv("train.csv") test_file = test_file.drop('ID_code',axis=1) test_sample = test_file.sample(n=384,random_state=1) # Defining code to return the most correlated features # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf # Build a df with a sample of 384 rows # Run the code below and build a model fitted on the highest correlations, then run predict def get_redundant_pairs(test_sample): '''Get diagonal and lower triangular pairs of correlation matrix''' pairs_to_drop = set() cols = test_sample.columns for i in range(0, test_sample.shape[1]): for j in range(0, i+1): pairs_to_drop.add((cols[i], cols[j])) return pairs_to_drop def get_top_abs_correlations(test_sample, n=5): au_corr = test_sample.corr().abs().unstack()
def spacy_tokenizer(sentence): mytokens = parser(sentence) mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ] mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ] mytokens = " ".join(mytokens) return mytokens # Applying the text-processing function on the body_text. tqdm.pandas() df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer) # Let's take a look at word count in the papers import seaborn as sns sns.distplot(df['body_word_count']) print(df['body_word_count'].describe()) sns.distplot(df['body_unique_words']) print(df['body_unique_words'].describe()) # Vectorization from sklearn.feature_extraction.text import TfidfVectorizer
# Sentence sizes range from 7 to 7944 characters. # ### 3.1 NLP # Now I will tokenize and lemmatize the sentences. # # Tokenization separates the sentences into a set of tokens. # # Lemmatization aims at reducing the inflectional forms of each word into a common base or root; unlike stemming, it takes into consideration the morphological analysis of the words. # # I still need to better understand stop word removal in the context of text classification. I do believe (for now) that we should not remove the negative words at least. Some discussion [here](https://datascience.stackexchange.com/questions/31048/pros-cons-of-stop-word-removal), [here](https://www.researchgate.net/post/Does_Pre-processing_step_Remove_Stop_Word_effect_Sentiment_Analysis_result) and [here](https://stackoverflow.com/questions/40144473/do-we-need-to-use-stopwords-filtering-before-pos-tagging) # In[17]: tqdm.pandas() # In[18]: df['tokens'] = df['sentence'].progress_apply(lemmatize_sent) # df['tokens'] = df.sentence.progress_apply(lambda d: lemmatize_sent(d)) # flatten = lambda data: reduce(lambda x, y: x + y, data, []) # In[19]: df['tokens'][:3] # In[20]: sum(df['sentence'].isnull())
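# `lemmatize_sent` is not defined in this excerpt. A minimal sketch of what such a helper
# might look like, assuming NLTK (with the punkt/wordnet/tagger data) is available:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def _penn_to_wordnet(tag):
    # map Penn Treebank POS tags onto the tag set WordNetLemmatizer expects
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def lemmatize_sent(sentence):
    # tokenize, POS-tag, then lemmatize each token with its mapped tag
    return [_lemmatizer.lemmatize(tok.lower(), _penn_to_wordnet(tag))
            for tok, tag in pos_tag(word_tokenize(sentence))]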
def urm_neg_score_user(mode, _last_click_score=1, _clicked_ref_score=1, _impr_not_seen_score=0, _seen_ref_score=1, cluster='no_cluster'): global impr_not_seen_score, last_click_score, seen_ref_score, clicked_ref_score impr_not_seen_score = _impr_not_seen_score last_click_score = _last_click_score clicked_ref_score = _clicked_ref_score seen_ref_score = _seen_ref_score save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode) accomodations_array = data.accomodations_ids() # load the dataframes according to the mode and cluster train_df = data.train_df(mode=mode, cluster=cluster) test_df = data.test_df(mode=mode, cluster=cluster) # fill missing clickout_item on the test dataframe test_df.fillna({'reference': -1}, inplace=True) train_df.fillna({'reference': -1}, inplace=True) # concatenate the train df and the test df mantaining only the columns of interest df = pd.concat([train_df, test_df])[[ 'session_id', 'user_id', 'action_type', 'reference', 'impressions' ]] session_groups = df.groupby(['user_id']) session_ids = list(session_groups.groups.keys()) rows_count = len(session_groups) cols_count = len(accomodations_array) # create dictionary (k: sessionId - v: urm row) row_of_sessionid = {} for i in range(len(session_ids)): row_of_sessionid[session_ids[i]] = i # create dictionary (k: accomodationId - v: urm col) col_of_accomodation = {} for i in range(cols_count): col_of_accomodation[accomodations_array[i]] = i print('dictionaries created\n') tqdm.pandas() sessions_score = session_groups.progress_apply( _session_score_negative_value_seen_elem).values print("apply function done\n") # create the urm using data indeces and indptr _data = [] indptr = [0] indices = [] values_inserted = 0 for i in tqdm(range(rows_count)): score_dict = sessions_score[i] for k in score_dict.keys(): # TODO: FIND WHY THERE IS A KEY EQUAL -1 if k != -1: indices.append(col_of_accomodation[k]) _data.append(score_dict[k]) values_inserted += 1 indptr.append(values_inserted) _urm = sps.csr_matrix((_data, indices, indptr), shape=(rows_count, cols_count)) print("URM created\n") #check if the folder where to save exsist cf.check_folder(save_path) print('Saving urm matrix... ') sps.save_npz('{}/urm_negative_user.npz'.format(save_path), _urm) print('done!') print('Saving row dictionary... ') np.save('{}/dict_row_user.npy'.format(save_path), row_of_sessionid) print('done!') print('Saving col dictionary... ') np.save('{}/dict_col_user.npy'.format(save_path), col_of_accomodation) print('done!')
def loadCSVs( self, tokenFilename: str = "data_aspects_tokens.csv", preprocessedFilename: str = "data_preprocessed.csv", lexiconFilename: str = "sentiment_lexicon.csv", ) -> bool: """ load all necessary CSV for execution of the detector and set indices as appropriate Args: tokenFilename (str, optional): Defaults to "data_aspects_tokens.csv". preprocessedFilename (str, optional): Defaults to "data_preprocessed.csv". lexiconFilename (str, optional): Defaults to "sentiment_lexicon.csv". Returns: bool: successful execution """ try: if self.df_aspect_tokens is None or self.df_aspect_tokens.empty: self.df_aspect_tokens = PD.read_csv(self.path + tokenFilename) self.df_aspect_tokens["polarity_strength"] = PD.NaT self.df_aspect_tokens["polarity_strength"].fillna( {i: [] for i in self.df_aspect_tokens.index}, inplace=True) self.df_aspect_tokens["sentiment_words"] = PD.NaT self.df_aspect_tokens["sentiment_words"].fillna( {i: [] for i in self.df_aspect_tokens.index}, inplace=True) self.df_aspect_tokens["intensifier_words"] = PD.NaT self.df_aspect_tokens["intensifier_words"].fillna( {i: [] for i in self.df_aspect_tokens.index}, inplace=True) self.df_aspect_tokens["word_found"] = self.df_aspect_tokens[ "word_found"].str.replace(r"[^\w]*", "", regex=True) # TODO remove after debugging # self.df_aspect_tokens = self.df_aspect_tokens[:100] if self.df_preprocessed is None or self.df_preprocessed.empty: self.df_preprocessed = PD.read_csv(self.path + preprocessedFilename) # pandas read_csv does not read arrays correctly so we need to adjust those tqdm.pandas(desc="Applying Datatype Transformations....") self.df_preprocessed["tokens"] = self.df_preprocessed[ "tokens"].progress_apply(lambda x: json.loads(x)) if self.df_lexicon is None or self.df_lexicon.empty: if not os.path.exists(self.path + lexiconFilename): self.downloadLexicon() self.df_lexicon = PD.read_csv(self.path + lexiconFilename) self.df_lexicon.drop_duplicates(subset=["word", "qualifier"], inplace=True) self.df_lexicon.set_index("word", inplace=True) self.df_lexicon.drop("%%") return True except IOError as e: print(e) return False
""" import pickle import os import pandas as pd import torch import spacy import re from itertools import permutations from tqdm import tqdm from .preprocessing_funcs import load_dataloaders from ..misc import save_as_pickle import logging tqdm.pandas(desc="prog-bar") logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \ datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) logger = logging.getLogger('__file__') def load_pickle(filename): completeName = os.path.join(os.getcwd() + "/src/data/",\ filename) with open(completeName, 'rb') as pkl_file: data = pickle.load(pkl_file) return data class infer_from_trained(object): def __init__(self, args=None, detect_entities=False): if args is None: self.args = load_pickle("args.pkl")
def annotate(self) -> None: """ function to call the "findAspects()" function for every row """ tqdm.pandas(desc="Finding Aspects!") self.data.progress_apply(lambda x: self.findAspects(x), axis=1)
def run(self): tqdm.pandas() kaggle_train_data = pandas.read_csv( os.path.expanduser('~/Datasets/Kaggle-Quora/train.csv')).drop( 'id', 1) a = kaggle_train_data.qid1.apply( lambda v: mmh3.hash(str(v).encode('ascii'), 2213) % 128 > 1) b = kaggle_train_data.qid2.apply( lambda v: mmh3.hash(str(v).encode('ascii'), 6663) % 128 > 1) print((a & b).sum(), np.invert(a).sum(), np.invert(b).sum()) print('Raw training questions') kaggle_train_data['question1_raw'] = kaggle_train_data[ 'question1'].fillna('') kaggle_train_data['question2_raw'] = kaggle_train_data[ 'question2'].fillna('') print('Clean training tokens') kaggle_train_data['question1_tokens'] = kaggle_train_data[ 'question1_raw'].progress_apply(clean_text) kaggle_train_data['question2_tokens'] = kaggle_train_data[ 'question2_raw'].progress_apply(clean_text) assert kaggle_train_data['question1_tokens'].str.len().max( ) > 0, 'No tokens 1 found' assert kaggle_train_data['question2_tokens'].str.len().max( ) > 0, 'No tokens 2 found' print('Clean training questions') kaggle_train_data['question1_clean'] = kaggle_train_data[ 'question1_tokens'].progress_apply(' '.join) kaggle_train_data['question2_clean'] = kaggle_train_data[ 'question2_tokens'].progress_apply(' '.join) train_data = kaggle_train_data[a & b].reset_index(drop=True) merge_data = kaggle_train_data[np.invert(b)].reset_index(drop=True) valid_data = kaggle_train_data[np.invert(a)].reset_index(drop=True) print('Writing training data') self.output().makedirs() train_data.to_msgpack('cache/dataset/train.msg') merge_data.to_msgpack('cache/dataset/merge.msg') valid_data.to_msgpack('cache/dataset/valid.msg') del train_data, valid_data, kaggle_train_data kaggle_test_data = pandas.read_csv( os.path.expanduser('~/Datasets/Kaggle-Quora/test.csv')) print('Raw testing questions') kaggle_test_data['question1_raw'] = kaggle_test_data[ 'question1'].fillna('') kaggle_test_data['question2_raw'] = kaggle_test_data[ 'question2'].fillna('') print('Clean testing tokens') kaggle_test_data['question1_tokens'] = kaggle_test_data[ 'question1_raw'].progress_apply(clean_text) kaggle_test_data['question2_tokens'] = kaggle_test_data[ 'question2_raw'].progress_apply(clean_text) print('Clean testing questions') kaggle_test_data['question1_clean'] = kaggle_test_data[ 'question1_tokens'].progress_apply(' '.join) kaggle_test_data['question2_clean'] = kaggle_test_data[ 'question2_tokens'].progress_apply(' '.join) kaggle_test_data['is_duplicate'] = -1 kaggle_test_data.to_msgpack('cache/dataset/test.msg') with self.output().open('w') as f: f.write('done')
with open(fname, "r") as f: for word in f: english_long.add(word.strip()) # ## Create lists of stopwords, punctuation, and unicode characters stop_words_list = stopwords_make( ) # Define old vocab file path if you want to remove first, dirty elements unicode_list = unicode_make() punctstr = punctstr_make() print("Stopwords, Unicodes, Punctuations lists creation complete!") #word2vec computation whole_text_unnested = [] whole_text_nested = [] tqdm.pandas(desc="Cleaning text") for school in tqdm(df['text'], desc="Cleaning text"): doc = [] for chunk in school.split("\n"): for sent in sent_tokenize(chunk): sent = clean_sentence_apache(sent, unhyphenate=True, remove_propernouns=False, remove_acronyms=False) sent = [word for word in sent if word != ''] if len(sent) > 0: whole_text_unnested.append(sent) doc.append(sent) whole_text_nested.append(doc)
def main(): print("Loading Data...") names = ["OFFENSE_TYPE_ID", "OFFENSE_CATEGORY_ID", "FIRST_OCCURRENCE_DATE", "REPORTED_DATE", "DISTRICT_ID", "PRECINCT_ID", "NEIGHBORHOOD_ID", "IS_CRIME", "IS_TRAFFIC"] filePath="crime.csv" pd.set_option('display.float_format', '{:.2f}'.format) fileExist=False if(os.path.exists('crime-treated.csv')): fileExist =True filePath="crime-treated.csv" names =["HOUR_REPORTED","DAY_REPORTED","WEEKDAY_REPORTED","MONTH_REPORTED","YEAR_REPORTED", "OFFENSE_CATEGORY_ID","NEIGHBORHOOD_ID"] data=pd.read_csv(filePath, parse_dates=True,usecols=names,nrows = None) else: data=pd.read_csv(filePath, parse_dates=True) print("======================================================================DATA INFO======================================") data.info() print("===================================DATA INFO======================================") if(not fileExist): print(data.head(5)) display(data.groupby([data.OFFENSE_CODE,data.OFFENSE_CODE_EXTENSION,data.OFFENSE_TYPE_ID]).size()) temp=display(data.groupby([data.INCIDENT_ID,data.OFFENSE_CODE,data.OFFENSE_CODE_EXTENSION,data.OFFENSE_TYPE_ID]).size()) print(temp) treatData(data) crimesDict = { 'all-other-crimes': 1, 'larceny' : 1, 'theft-from-motor-vehicle' : 3, 'drug-alcohol' : 2, 'auto-theft' : 3, 'white-collar-crime': 1, 'burglary': 2, 'public-disorder' : 2, 'aggravated-assault': 3, 'other-crimes-against-persons' : 2, 'robbery' : 3, 'sexual-assault' : 3, 'murder': 3, 'arson': 2 } tqdm.pandas() print("Calculating Offense Weigh...") data['OFFENSE_WEIGH'] = data.progress_apply(lambda row: crimesDict[row.OFFENSE_CATEGORY_ID], axis=1 ) dataCount = data.groupby(['MONTH_REPORTED','WEEKDAY_REPORTED','HOUR_REPORTED','NEIGHBORHOOD_ID']).MONTH_REPORTED.agg('count').to_frame('COUNT').reset_index() dataClenad =data.groupby(['MONTH_REPORTED','WEEKDAY_REPORTED','HOUR_REPORTED','NEIGHBORHOOD_ID'], as_index=False).agg({'OFFENSE_WEIGH':'sum'})#['OFFENSE_WEIGH'].sum()['GEO_X'].mean() dataClenad['COUNT'] = dataCount['COUNT'] print(dataClenad) print("Calculating Safety ...") medianCrime = dataClenad['OFFENSE_WEIGH'].median() modeCrime = dataClenad['OFFENSE_WEIGH'].mode().values modeQtd = dataClenad['COUNT'].mode().values print("CRIME MODE: ",modeCrime) dataClenad['SAFETY'] = dataClenad.progress_apply(lambda row: 1 if row.OFFENSE_WEIGH <= modeCrime and row.COUNT <= modeQtd else 0, axis=1 ) counts = dataClenad['SAFETY'].value_counts() # CratGraph(dataClenad) ExecuteKNN(dataClenad) ExecuteDecisionTree(dataClenad) le = LabelEncoder() dataClenad = dataClenad.progress_apply(le.fit_transform) x_columns = ['MONTH_REPORTED', 'WEEKDAY_REPORTED', 'HOUR_REPORTED', 'NEIGHBORHOOD_ID', 'OFFENSE_WEIGH', 'COUNT'] y_columns = ['SAFETY'] x_train, x_test = train_test_split(dataClenad[x_columns], test_size=0.3) y_train, y_test = train_test_split(dataClenad[y_columns].values.ravel(), test_size=0.3) bagging(5, 200, dataClenad, np.ravel(y_train, order='C'), x_train) exit(0)
import re import sys import glob import string from pprint import pprint from collections import Counter, OrderedDict import spacy nlp = spacy.load('en', disable=['parser', 'tagger', 'ner']) import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from tqdm import tqdm, tqdm_notebook, tnrange tqdm.pandas(desc='Progress') from sklearn.metrics import accuracy_score from sklearn.preprocessing import LabelEncoder import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from torch.autograd import Variable import pickle import math import gensim.models
sep='\t')) # create a dictionary locations : coordinates df_loc = df_loc.sort_values(by=['ADM1']) df_loc = df_loc.drop_duplicates(subset=['FULL_NAME_ND_RO'], keep='first') loc_dict = dict(zip(df_loc.FULL_NAME_ND_RO, zip(df_loc.LAT, df_loc.LONG))) loc_dict import re, string from tqdm import tqdm tqdm.pandas(desc="progress") from tqdm import tqdm_notebook from fuzzywuzzy import process from fuzzywuzzy import fuzz remove_keywords = [ 'nan', 'Planes de Emergencia', 'Cruz Roja', 'Media Luna Roja', 'Guatemala', 'Argentina', 'Japon', 'Juventud' ] remove_keywords.extend( ['Malawi', 'Lebanon', 'لبنان', 'Nederland', 'Netherlands'])
import pandas as pd import numpy as np import csv import difflib from tqdm import tqdm tqdm.pandas(desc="Fuzzy Match Progress") patent_assignee_mapping = pd.read_csv("./patent_assignee/patent_assignee.tsv", sep="\t", low_memory=False) patent_assignee_mapping.head() patent_results = pd.read_csv( "../Phase-1-Search-Term/green-technology/all-patent-green-terms-searches.csv", dtype={ 'Emerging low carbon-Additional energy sources': bool, 'Emerging low carbon-All-purpose': bool, 'Emerging low carbon-Alternative fuel': bool, 'Emerging low carbon-Alternative fuel vehicle': bool, 'Emerging low carbon-Battery': bool, 'Emerging low carbon-Building technologies': bool, 'Emerging low carbon-Carbon capture & storage': bool, 'Emerging low carbon-Electrochemical processes': bool, 'Emerging low carbon-Energy management': bool, 'Environmental-Air pollution': bool, 'Environmental-All-purpose': bool, 'Environmental-Biological treatment': bool, 'Environmental-Contaminated land reclamation & remediation': bool, 'Environmental-Environmental monitoring, instrumentation and analysis': bool, 'Environmental-Marine pollution control': bool,
# -*- coding: utf-8 -*- """ Created on Thu Jul 26 21:20:13 2018 @author: 徐嘉诚 """ import pandas as pd import numpy as np import gc #import os import datetime from sklearn.metrics import mean_squared_error from tqdm import tqdm, tqdm_notebook tqdm_notebook.pandas() #filelist = os.listdir('../data') def get_beautiful_test(test): test_rnd = np.round(test.iloc[:, 1:], 2) ugly_indexes = [] non_ugly_indexes = [] for idx in tqdm(range(len(test))): if not np.all(test_rnd.iloc[idx, :].values == test.iloc[idx, 1:].values): ugly_indexes.append(idx) else: non_ugly_indexes.append(idx) print(len(ugly_indexes), len(non_ugly_indexes)) np.save('test_ugly_indexes', np.array(ugly_indexes))
import torch import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from matplotlib import pyplot as plt import cv2 from tqdm import tqdm import multiprocessing as mp from preprocessing import read_images from prototypicalNet import PrototypicalNet, train_step, test_step tqdm.pandas(desc="my bar!") def main(): trainx, trainy = read_images('../input/omniglot/images_background/') testx, testy = read_images('../input/omniglot/images_evaluation/') use_gpu = torch.cuda.is_available() trainx = torch.from_numpy(trainx).float() testx = torch.from_numpy(testx).float() if use_gpu: trainx = trainx.cuda() testx = testx.cuda() print(trainx.size(), testx.size()) num_episode = 16000 frame_size = 1000 trainx = trainx.permute(0, 3, 1, 2) testx = testx.permute(0, 3, 1, 2) frame_loss = 0 frame_acc = 0 for i in range(num_episode): loss, acc = train_step(trainx, trainy, 5, 60, 5) frame_loss += loss.data
def urm_session_aware(mode, action_score_dict, cluster='no_cluster', time_weight='lin'): """ Create the URM considering the whole session of a user and giving scores based on its interactions :param train_df: :param test_df: :param time_weight: :param save_path: :param save: :return: """ global tw tw = time_weight save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode) accomodations_array = data.accomodations_ids() # load the dataframes according to the mode and cluster train_df = data.train_df(mode=mode, cluster=cluster) test_df = data.test_df(mode=mode, cluster=cluster) # fill missing clickout_item on the test dataframe test_df.fillna({'reference': -1}, inplace=True) train_df.fillna({'reference': -1}, inplace=True) # concatenate the train df and the test df mantaining only the columns of interest df = pd.concat([train_df, test_df])[[ 'session_id', 'user_id', 'action_type', 'reference', 'impressions' ]] session_groups = df.groupby(['session_id', 'user_id']) session_ids = list(session_groups.groups.keys()) rows_count = len(session_groups) cols_count = len(accomodations_array) # create dictionary (k: sessionId - v: urm row) row_of_sessionid = {} for i in range(len(session_ids)): row_of_sessionid[session_ids[i]] = i # create dictionary (k: accomodationId - v: urm col) col_of_accomodation = {} for i in range(cols_count): col_of_accomodation[accomodations_array[i]] = i print('dictionaries created\n') tqdm.pandas() sessions_score = session_groups.progress_apply( _compute_session_score).values print("apply function done\n") # create the urm using data indeces and indptr _data = [] indptr = [0] indices = [] values_inserted = 0 for i in tqdm(range(rows_count)): score_dict = sessions_score[i] for k in score_dict.keys(): indices.append(col_of_accomodation[k]) _data.append(score_dict[k]) values_inserted += 1 indptr.append(values_inserted) _urm = sps.csr_matrix((_data, indices, indptr), shape=(rows_count, cols_count)) print("URM created\n") #check if the folder where to save exsist cf.check_folder(save_path) print('Saving urm matrix... ') sps.save_npz('{}/urm_session_aware1_{}.npz'.format(save_path, time_weight), _urm) print('done!') print('Saving row dictionary... ') np.save('{}/dict_row.npy'.format(save_path), row_of_sessionid) print('done!') print('Saving col dictionary... ') np.save('{}/dict_col.npy'.format(save_path), col_of_accomodation) print('done!')
import pandas as pd import pandas_datareader as web import numpy as np import seaborn as sns import matplotlib.pyplot as plt import os import re from tqdm import tqdm tqdm.pandas() # need to run in notebook import warnings warnings.filterwarnings(action='ignore') pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) #-----------sklearn---------------------------------------------------- from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split #-----------other_models--------------------------------------------------- from xgboost import XGBClassifier from lightgbm import LGBMClassifier #-----------misc---------------------------------------------------- import gc from datetime import datetime, timedelta
def create_urm(self): # load the dataframes according to the mode and cluster train_df = data.train_df(mode=self.mode, cluster=self.cluster) test_df = data.test_df(mode=self.mode, cluster=self.cluster) # fill missing clickout_item on the test dataframe test_df.fillna({'reference': -1}, inplace=True) train_df.fillna({'reference': -1}, inplace=True) # concatenate the train df and the test df mantaining only the columns of interest df = pd.concat([train_df, test_df])[[ 'session_id', 'user_id', 'action_type', 'reference', 'impressions' ]] if self.type == 'user': session_groups = df.groupby(['user_id']) if self.type == 'session': session_groups = df.groupby(['user_id', 'session_id']) # it coincides with the rows number groups_keys = list(session_groups.groups.keys()) rows_count = len(groups_keys) cols_count = len(self.accomodations_id) """ create ROW dictionary if type == USER : key: user_id -- value: row_urm if type == SESSION : key: (user_id, session_id) -- value: row_urm """ row_dict = {} for i in range(rows_count): row_dict[groups_keys[i]] = i """ create COL dictionary key: accomodation_id -- value: col_urm """ col_dict = {} for i in range(cols_count): col_dict[self.accomodations_id[i]] = i print('dictionaries created\n') tqdm.pandas() # compute the score sessions_score = session_groups.progress_apply( self._compute_session_score).values print("apply function done\n") # create the urm using data indeces and indptr _data = [] indptr = [0] indices = [] values_inserted = 0 for i in tqdm(range(rows_count)): score_dict = sessions_score[i] for k in score_dict.keys(): indices.append(col_dict[k]) _data.append(score_dict[k]) values_inserted += 1 indptr.append(values_inserted) _urm = sps.csr_matrix((_data, indices, indptr), shape=(rows_count, cols_count)) print("URM created\n") print('Saving urm matrix... ') sps.save_npz(f'{self.save_path}/{self.name}.npz', _urm) print('done!') print('Saving row dictionary... ') np.save(f'{self.save_path}/{self.name}_dict_row.npy', row_dict) print('done!') print('Saving col dictionary... ') np.save(f'{self.save_path}/{self.name}_dict_col.npy', col_dict) print('done!')
'fuzz_token_sort_ratio_question': fuzz_token_sort_ratio_question, 'fuzz_token_sort_ratio_lemma1': fuzz_token_sort_ratio_lemma1, 'fuzz_token_sort_ratio_lemma2': fuzz_token_sort_ratio_lemma2, 'fuzz_token_set_ratio_question': fuzz_token_set_ratio_question, 'fuzz_token_set_ratio_lemma1': fuzz_token_set_ratio_lemma1, 'fuzz_token_set_ratio_lemma2': fuzz_token_set_ratio_lemma2 } # read train dataframe df_train = pd.read_csv('../../data/Quora_Question_Pairs/middle/train_lemma_stem.csv') # apply each function from calc_table to df_train for name, fun in tqdm(calc_table.items()): tqdm.pandas(desc=name) df_train[name] = df_train.progress_apply(fun, axis=1) # save result and delete df_train df_train.to_csv("../../data/Quora_Question_Pairs/middle/train_lemma_stem.csv") # read test dataframe df_test = pd.read_csv('../../data/Quora_Question_Pairs/middle/test_lemma_stem.csv') # apply each function from calc_table to df_test for name, fun in tqdm(calc_table.items()): tqdm.pandas(desc=name) df_test[name] = df_test.progress_apply(fun, axis=1) # save result df_test
def _createNewTrainingSetWithFeatureVariations(basicDf, newFeatureDf, featureOfInterest, variation_degree): import pandas as pd import numpy as np from tqdm import tqdm from utilities.pandasTools import suffixColumnsWithLabel try: # Create and register a new `tqdm` instance with `pandas` # (can use tqdm_gui, optional kwargs, etc.) tqdm.pandas() print( '_createNewTrainingSetWithFeatureVariations check point 1 >>> variation_degree >>> ' + str(variation_degree)) featureVariants = [[ np.exp( suffixColumnsWithLabel(newFeatureDf, '_exp_' + str(iterator)) * iterator), np.exp( suffixColumnsWithLabel(newFeatureDf, '_exp_inv_' + str(iterator)) * iterator * -1), np.power( suffixColumnsWithLabel(newFeatureDf, '_pow_' + str(iterator)), iterator), np.power( suffixColumnsWithLabel(newFeatureDf, '_pow_inv_' + str(iterator)).astype(float), iterator * -1) ] for iterator in range(1, variation_degree + 1)] print('_createNewTrainingSetWithFeatureVariations check point 2') segmentCount, rowCount, colCount = len(featureVariants), len( featureVariants[0]), len(featureVariants[0][0]) cummulativeListOfFeatures = np.empty(segmentCount, dtype=list) print('_createNewTrainingSetWithFeatureVariations check point 3') for segmentItr in range(0, segmentCount - 1): cummulativeListOfFeatures[segmentItr] = pd.DataFrame([]) for rowItr in range(0, rowCount - 1): cummulativeListOfFeatures[segmentItr] = pd.concat([ cummulativeListOfFeatures[segmentItr], featureVariants[segmentItr][rowItr] ], axis=1) print('_createNewTrainingSetWithFeatureVariations check point 4') cummulativeListOfFeatures = pd.concat(cummulativeListOfFeatures, axis=1) newTrainingSetDf = pd.concat( [basicDf, newFeatureDf, cummulativeListOfFeatures], axis=1) print('_createNewTrainingSetWithFeatureVariations check point 5') return newTrainingSetDf except: print("Error executing method >>> ") # exc_type, exc_obj, exc_tb = sys.exc_info() # fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] # print("Unexpected error:", sys.exc_info()) # print(exc_type, fname, exc_tb.tb_lineno) # http://docs.python.org/2/library/sys.html#sys.exc_info exc_type, exc_value, exc_traceback = sys.exc_info( ) # most recent (if any) by default ''' Reason this _can_ be bad: If an (unhandled) exception happens AFTER this, or if we do not delete the labels on (not much) older versions of Py, the reference we created can linger. traceback.format_exc/print_exc do this very thing, BUT note this creates a temp scope within the function. ''' traceback_details = { 'filename': exc_traceback.tb_frame.f_code.co_filename, 'lineno': exc_traceback.tb_lineno, 'name': exc_traceback.tb_frame.f_code.co_name, 'type': exc_type.__name__, 'message': traceback.extract_tb(exc_traceback) } del (exc_type, exc_value, exc_traceback ) # So we don't leave our local labels/objects dangling # This still isn't "completely safe", though! # "Best (recommended) practice: replace all exc_type, exc_value, exc_traceback # with sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2] print print(traceback.format_exc()) print print(traceback_template % traceback_details) print #traceback.print_exception() raise
def load_time_series_dataset(cfg, slide=None): ''' Load the static and time series data from disk and join them. Create time series examples to form large dataset with time series and static features. Partition into train/val/test sets. Normalize numerical data. :param cfg: Project config (from config.yml) :param slide: Int that controls how many recent dates to cut off from the dataset :return: A dict of partitioned and normalized datasets, split into examples and labels ''' # Load data info generated during preprocessing data = {} data['METADATA'] = {} input_stream = open(cfg['PATHS']['DATA_INFO'], 'r') data_info = yaml.full_load(input_stream) data['METADATA']['N_WEEKS'] = data_info['N_WEEKS'] noncat_features = data_info[ 'NON_CAT_FEATURES'] # Noncategorical features to be scaled T_X = cfg['DATA']['TIME_SERIES']['T_X'] tqdm.pandas() # Load data (before and after one-hot encoding) df_ohe = pd.read_csv(cfg['PATHS']['PROCESSED_OHE_DATA']) df = pd.read_csv(cfg['PATHS']['PROCESSED_DATA'] ) # Static and dynamic data prior to one hot encoding time_series_feats = [ f for f in df.columns if ('-Day_' in f) and (')' not in f) ] # Partition dataset by date unique_dates = np.flip(df_ohe['Date'].unique()).flatten() val_split = cfg['TRAIN']['VAL_SPLIT'] if val_split * unique_dates.shape[0] < 1: val_split = 1.0 / unique_dates.shape[ 0] # Ensure validation set contains records from at least 1 time step print("Val set split in config.yml is too small. Increased to " + str(val_split)) test_split = cfg['TRAIN']['TEST_SPLIT'] if test_split * unique_dates.shape[0] < 1: test_split = 1.0 / unique_dates.shape[ 0] # Ensure test set contains records from at least 1 time step print("Test set split in config.yml is too small. Increased to " + str(test_split)) if slide is None: test_df_dates = unique_dates[-int(test_split * unique_dates.shape[0]):] val_df_dates = unique_dates[-int( (test_split + val_split) * unique_dates.shape[0]):-int(test_split * unique_dates.shape[0])] train_df_dates = unique_dates[0:-int((test_split + val_split) * unique_dates.shape[0])] else: test_split_size = max(int((test_split) * unique_dates.shape[0]), 1) val_split_size = max(int((val_split) * unique_dates.shape[0]), 1) offset = slide * test_split_size if offset == 0: test_df_dates = unique_dates[-(test_split_size):] else: test_df_dates = unique_dates[-(test_split_size + offset):-offset] val_df_dates = unique_dates[-(val_split_size + test_split_size + offset):-(test_split_size + offset)] train_df_dates = unique_dates[0:-(val_split_size + test_split_size + offset)] train_df_ohe = df_ohe[df_ohe['Date'].isin(train_df_dates)] val_df_ohe = df_ohe[df_ohe['Date'].isin(val_df_dates)] test_df_ohe = df_ohe[df_ohe['Date'].isin(test_df_dates)] train_df = df[df['Date'].isin(train_df_dates)] test_df = df[df['Date'].isin(test_df_dates)] print('Train set size = ' + str(train_df_ohe.shape[0]) + '. Val set size = ' + str(val_df_ohe.shape[0]) + '. Test set size = ' + str(test_df_ohe.shape[0]))
# Save train & test set for LIME train_df.to_csv(cfg['PATHS']['TRAIN_SET'], sep=',', header=True, index=False) test_df.to_csv(cfg['PATHS']['TEST_SET'], sep=',', header=True, index=False) # Anonymize clients train_df_ohe.drop(['ClientID', 'Date'], axis=1, inplace=True) val_df_ohe.drop(['ClientID', 'Date'], axis=1, inplace=True) test_df_ohe.drop(['ClientID', 'Date'], axis=1, inplace=True) # Get indices of noncategorical features noncat_feat_idxs = [ test_df_ohe.columns.get_loc(c) for c in noncat_features if c in test_df_ohe ] # Separate ground truth from dataframe and convert to numpy arrays data['Y_train'] = np.array(train_df_ohe.pop('GroundTruth')) data['Y_val'] = np.array(val_df_ohe.pop('GroundTruth')) data['Y_test'] = np.array(test_df_ohe.pop('GroundTruth')) # Convert feature dataframes to numpy arrays data['X_train'] = np.array(train_df_ohe) data['X_val'] = np.array(val_df_ohe) data['X_test'] = np.array(test_df_ohe) # Normalize numerical data and save the scaler for prediction. col_trans_scaler = ColumnTransformer(transformers=[ ('col_trans_ordinal', StandardScaler(), noncat_feat_idxs) ], remainder='passthrough') data['X_train'] = col_trans_scaler.fit_transform( data['X_train']) # Only fit train data to prevent data leakage data['X_val'] = col_trans_scaler.transform(data['X_val']) data['X_test'] = col_trans_scaler.transform(data['X_test']) dump(col_trans_scaler, cfg['PATHS']['SCALER_COL_TRANSFORMER'], compress=True) data['METADATA']['NUM_TS_FEATS'] = len( time_series_feats) # Number of different time series features data['METADATA']['T_X'] = T_X return data
""" @author - Mohsin """ import numpy as np np.random.seed(786) # for reproducibility import pandas as pd from sklearn.model_selection import KFold from sklearn.preprocessing import MinMaxScaler from tqdm import tqdm tqdm.pandas(tqdm) from utils import * import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) if __name__ == "__main__": LOGGER_FILE = "prepImageFeature3.log" IMAGE_FILE_1 = "../utility/df_image_feats3.csv" ###################### Logger ######################################### handler = logging.FileHandler(LOGGER_FILE) handler.setLevel(logging.INFO) # create a logging format formatter = logging.Formatter(
def get_variant_data(connection, limit=0, testRun = False): ''' Get Variant data for Solr document. ''' # Special variant IDs that represent testAssociationId = [ 42893, # Missing from solr slim 47195, # kgp ID ] # function to retrieve further data from the database: def get_more_variant_data(row): # Extracting basic variant information: resourcename = 'variant' ID = row['ID'] rsID = row['RS_ID'] consequence = str(row['FUNCTIONAL_CLASS']).replace("_", " ").capitalize() # Extracting association count: association_count = variant_cls.get_association_count(ID) # Extracting study count: study_count = variant_cls.get_study_count(ID) # We don't care about variants that have no associations: if association_count == 0: return(1) # Extracting genomic location: location = variant_cls.get_variant_location(ID) # Extracting mapped genes: mapped_genes_list = variant_cls.get_mapped_genes(ID) mapped_genes_names = [x.split("|")[0] for x in mapped_genes_list] mapped_genes_names = list(set(mapped_genes_names)) # GOCI-2475 - unique set of names are generated. # Extracting merged rsID: current_rsID = variant_cls.get_current_rsID(ID) # Assign merged rsID and generate title: title = '' if current_rsID: merged_rsID = rsID title = "%s (%s)" %(current_rsID, merged_rsID) else: current_rsID = rsID merged_rsID = '' title = current_rsID # Combining data into a dictionary: varDoc = { 'resourcename' : resourcename, 'id' : "%s:%s" % (resourcename,ID), 'title' : title, 'rsID' : rsID, 'current_rsID' : current_rsID, 'merged_rsID' : merged_rsID, 'associationCount' : association_count, 'studyCount' : study_count, 'mappedGenes' : mapped_genes_list, 'consequence' : consequence, } # Adding only valid location indicated by integer position: if isinstance(location['position'], int): varDoc['chromosomeName'] = location['chromosome'] varDoc['chromosomePosition'] = location['position'] varDoc['region'] = location['region'] # Adding description to the document: coordinates = '%s:%s' %(location['chromosome'], location['position']) genes_str = ",".join(mapped_genes_names) varDoc['description'] = "|".join([coordinates, str(location['region']), consequence,genes_str]) # Adding to document list: all_variant_data.append(varDoc) # Initialize empty list for the documents: all_variant_data = [] # Step 1: initialize variant object: variant_cls = variant_sqls(connection) # Step 2: retrieve all the variants in the database: variants_df = variant_cls.get_snps() # Inintialize progress bar: tqdm.pandas(desc="Returning variant data") # Step 3: Calling apply to retrieve all variant data: if limit != 0: variants_df[0:limit].progress_apply(get_more_variant_data, axis = 1) elif testRun: variants_df[variants_df['ID'].isin(testAssociationId)].progress_apply(get_more_variant_data, axis = 1) else: variants_df.progress_apply(get_more_variant_data, axis = 1) return all_variant_data
wordcount[word] += 1 # function to replace every word in the lemmatized message with its most popular synonym def build_new_sent(sent): list_sent = [] for word in str(sent).split(): word_d = dict() for syn in wordnet.synsets(word): for lemma in syn.lemma_names(): word_d[lemma] = wordcount[lemma] if len(word_d.items()): list_sent.append(max(word_d.items(), key=operator.itemgetter(1))[0]) else: list_sent.append(word) return ' '.join(list_sent) # apply the build_new_sent function to df_train and save the results tqdm.pandas(desc="lemma_q1") df_train['lemma_q1_new'] = df_train['lemma_q1'].progress_apply(build_new_sent) tqdm.pandas(desc="lemma_q2") df_train['lemma_q2_new'] = df_train['lemma_q2'].progress_apply(build_new_sent) df_train.to_csv("../middle/train_lemma_stem.csv", index=False) # apply the build_new_sent function to df_test and save the results tqdm.pandas(desc="lemma_q1") df_test['lemma_q1_new'] = df_test['lemma_q1'].progress_apply(build_new_sent) tqdm.pandas(desc="lemma_q2") df_test['lemma_q2_new'] = df_test['lemma_q2'].progress_apply(build_new_sent) df_test.to_csv("../middle/test_lemma_stem.csv", index=False)
def main(): # open df_train = pd.read_csv('../middle/train_lemma_stem.csv') df_test = pd.read_csv('../middle/test_lemma_stem.csv') # drop None df_train = df_train.replace(np.nan, ' ', regex=True) df_test = df_test.replace(np.nan, ' ', regex=True) # tuple of input and output columns names name_tuple = [('lemma_q1', 'lemma1doc'), ('lemma_q2', 'lemma2doc'), ('lemma_q1_new', 'lemma1doc_syn'), ('lemma_q2_new', 'lemma2doc_syn')] # apply spacy_doc function for columns from name_tuple to df_train for t in name_tuple: tqdm.pandas(desc=t[0]+'train') df_train[t[1]] = df_train[t[0]].progress_apply(spacy_doc) # find similarity in df_train tqdm.pandas(desc='spacy_similarity_lem+train') df_train['spacy_sim_lem'] = df_train.progress_apply(spacy_similarity_lem, axis=1) tqdm.pandas(desc='spacy_similarity_syn+train') df_train['spacy_sim_syn'] = df_train.progress_apply(spacy_similarity_syn, axis=1) df_train.to_csv('../middle/train_lemma_stem.csv') # apply spacy_doc function for columns from name_tuple to df_test for t in name_tuple: tqdm.pandas(desc=t[0]+'test') df_test[t[1]] = df_test[t[0]].progress_apply(spacy_doc) # find similarity in df_test tqdm.pandas(desc='spacy_similarity_lem+test') df_test['spacy_sim_lem'] = df_test.progress_apply(spacy_similarity_lem, axis=1) tqdm.pandas(desc='spacy_similarity_syn+test') df_test['spacy_sim_syn'] = df_test.progress_apply(spacy_similarity_syn, axis=1) df_test.to_csv('../middle/test_lemma_stem.csv', index=False)
def mapUniProt(geneIndexFile): #fills in the uniProt column of the geneIndex dataframe by calling the uniprot mapping server tqdm.pandas(desc="mapping entrez to uniProt") indexFrame = pd.read_pickle(geneIndexFile) indexFrame['uniProt'] = indexFrame['Entrez_Gene_Id'].progress_apply(lambda entrez: getUniprot(entrez))
# Keywords X # Legend: # X = Complete # V = Currently Void # P = Partially Stabilized # ------------------------------------------------------------------------------------------------------------ # United Kingdom # ------------------------------------------------------------------------------------------------------------ os.chdir(MAIN_FOLDER + "/Data/Governmental_Science_Funding/UK") uk_df = pd.read_csv("uk_funding_data.csv") # Start tqdm tqdm.pandas(desc="status") # Limit to Research Grants uk_df = uk_df[uk_df['ProjectCategory'] == 'Research Grant'].reset_index(drop=True) # Drop unneeded columns to_drop = ['ProjectReference', 'EndDate', 'Status', 'PIOtherNames', 'PI ORCID iD', 'StudentSurname', 'StudentFirstName', 'StudentOtherNames', 'Student ORCID iD', 'GTRProjectUrl',
# 'pos_max': positive sentiment (strongest sentiment statement)', # 'comp_max': composite sentiment (strongest sentiment statement)', # Import dependencies from statistics import mean import pandas as pd from nltk.sentiment.vader import SentimentIntensityAnalyzer from nltk import sent_tokenize import numpy as np from tqdm import tqdm import collections # load data review = pd.read_csv('C:/Users/adamb/Desktop/vader/yelp_review.csv') tqdm.pandas() # progress bar analyser = SentimentIntensityAnalyzer() # create analyser # Function to get scores def sentiment_analyzer_scores(text): """Calculates sentiment scores for each text review Parameters ---------- text : str Text to analyze Returns ---------- list of float:
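# The body of `sentiment_analyzer_scores` is truncated above. A hedged sketch of how the
# per-review metrics described in the leading comments (mean and strongest-sentence scores)
# are commonly computed with VADER -- an assumption about the intent, not the original code:
def sentiment_scores_sketch(text):
    # score each sentence, then keep the mean and the strongest (max) value per metric
    sentences = sent_tokenize(text) or [text]
    scores = [analyser.polarity_scores(s) for s in sentences]
    return [
        mean(s['pos'] for s in scores), max(s['pos'] for s in scores),            # pos_mean, pos_max
        mean(s['neg'] for s in scores), max(s['neg'] for s in scores),            # neg_mean, neg_max
        mean(s['compound'] for s in scores), max(s['compound'] for s in scores),  # comp_mean, comp_max
    ]

# Hypothetical usage with the progress bar registered above (the text column name is assumed):
# review['scores'] = review['text'].progress_apply(sentiment_scores_sketch)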
from molmaps import distances, calculator, summary import pandas as pd import numpy as np from rdkit import Chem from tqdm import tqdm tqdm.pandas(ascii=True) def caldis(data, idx, tag, methods=['correlation', 'cosine', 'jaccard']): ############################################################## Nf = len(feature.fingerprint.Extraction().bitsinfo) data0 = loadnpy('./data/fingerprint_8206960.npy', N=Nf, dtype=np.bool) groups = data0.sum(axis=1) from sklearn.model_selection import GroupKFold G = GroupKFold(n_splits=10) sp = G.split(X=data0, groups=groups) spl = list(sp) sidx = spl[0][1] del data0 print(len(sidx)) data = data[sidx] data = data.astype(np.float32, copy=False) ############################################################# for method in methods: res = calculator.pairwise_distance(data, n_cpus=16, method=method) res = np.nan_to_num(res, copy=False) df = pd.DataFrame(res, index=idx, columns=idx) df = df.astype('float32')
import random import numpy as np import pandas as pd from nltk.corpus import stopwords from nltk.tokenize.regexp import regexp_tokenize from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, classification_report from sklearn.model_selection import train_test_split from tqdm import tqdm tqdm.pandas(desc="progress-bar") import re from gensim.models import Doc2Vec, doc2vec from sklearn import utils from sklearn.utils import shuffle import classifierutils import dataread headers = dataread.read_file('top_sectionheaders_5000.txt') results = [] for header in headers: ret_val = [] for item in header_corpus[header]['labelled_tokenised'].TEXT: logreg.predict([model_dbow.infer_vector(item)])
def apply_preprocessing(df): tqdm.pandas() df['reviewClean_sw'], df['noun_adjective'] = zip( *df['reviewText'].progress_apply(text_preprocessing)) return df
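# `text_preprocessing` is defined elsewhere; for the zip-unpacking above to work, it must
# return a 2-tuple per review. A hypothetical stub illustrating the expected return shape:
def text_preprocessing(text):
    # first element fills `reviewClean_sw`, second fills `noun_adjective`
    cleaned = " ".join(text.lower().split())  # placeholder cleaning step
    noun_adjectives = []                      # e.g. POS-filtered tokens in the real implementation
    return cleaned, noun_adjectives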
import pandas as pd import numpy as np from tqdm import tqdm df = pd.DataFrame(np.random.randint(0, 100, (100000, 6))) # Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm` # (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.) tqdm.pandas(desc="my bar!") # Now you can use `progress_apply` instead of `apply` # and `progress_map` instead of `map` df.progress_apply(lambda x: x**2) # can also groupby: # df.groupby(0).progress_apply(lambda x: x**2) # -- Source code for `tqdm_pandas` (really simple!) # def tqdm_pandas(t): # from pandas.core.frame import DataFrame # def inner(df, func, *args, **kwargs): # t.total = groups.size // len(groups) # def wrapper(*args, **kwargs): # t.update(1) # return func(*args, **kwargs) # result = df.apply(wrapper, *args, **kwargs) # t.close() # return result # DataFrame.progress_apply = inner
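# `progress_map` works the same way on a Series once `tqdm.pandas(...)` has been called;
# a small companion example using column 0 of the DataFrame created above:
df[0].progress_map(lambda x: x * 10)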
import networkx as nx import numpy as np import pandas as pd from tqdm import tqdm from feature_engineering.tools import lit_eval_nan_proof # this script computes the features authors_citation and coauthor score by considering the graph of coauthorship and # the author's graph of citations. # the script takes approximately 5 minutes to run # progress bar for pandas tqdm.pandas() # path path_to_data = "data/" # loading data converter_dict = { 'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof } nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) nodes.set_index("id", inplace=True) training = pd.read_csv(path_to_data + "training_features.txt") training.set_index("my_index", inplace=True) testing = pd.read_csv(path_to_data + "testing_features.txt") testing.set_index("my_index", inplace=True)
import numpy as np import pandas as pd pd.set_option("display.max_columns",1000) # don’t put … instead of multi columns pd.set_option('expand_frame_repr',False) # for not wrapping columns if you have many from optparse import OptionParser from tqdm import tqdm tqdm.pandas() from sys import platform import plotly as py import cufflinks import sys sys.path.append('../') sys.path.append('../../') sys.path.append('../../../') import int_force if __name__ == '__main__': timer=int_force.global_imports.timer() help_text=''' examples: seq 0 40 |xargs -I ^ echo python3 "%prog" -s ^ \& sbatch --mem=1800m -c1 --time=0:50:0 --array=0-399 --wrap 'python3 %prog -s ${SLURM_JOB_ID}_${SLURM_ARRAY_TASK_ID}' sbatch --mem=1800m -c1 --time=0:50:0 --array=0-199 --wrap 'python3 %prog -s ${SLURM_JOB_ID}_${SLURM_ARRAY_TASK_ID} -q "[0,3,150]" -b "[5,17,19,10001]" -m 15' ''' parser = OptionParser(usage=help_text, version="%prog 1.0 beta") parser.add_option("-n", dest="samples", type="int", default=1000, help='number of dots X2 because you have x and y. for example 1000. you better use 5 [default: %default]') parser.add_option("-s", dest="split_id", type="str", default='0', help='the split unique id so it will not override old output [default: %default]')
from timeit import default_timer as timer import pandas as pd import numpy as np from umap import UMAP from ivis import Ivis from evaluation import Doc2VecModel from tqdm import tqdm from pyod.models.ocsvm import OCSVM from pyod.models.hbos import HBOS from pyod.models.pca import PCA from itertools import permutations from utils import next_path, product_dict, get_scores, reject_outliers, sample_data, remove_short_texts tqdm.pandas(desc="progess: ") class IQROutlier: def __init__(self, contamination=0.1): self.contamination = contamination def fit(self, X, y=None): pcnt = self.contamination / 2 qlow, self.median, qhigh = np.quantile(X, [pcnt, 0.50, 1-pcnt]) self.iqr = qhigh - qlow return self def transform(self, X, thresh_factor=1.0): iqr = self.iqr*thresh_factor preds = ((np.abs(X - self.median)) >= iqr/2)