def build_distractor_db():
    """Build and persist a word -> distractors lookup table, with a progress bar."""
    bar = st.progress(0)
    words = read_df("words.csv")["word"]
    distractors = []
    for i, word in enumerate(words):
        bar.progress(int(i / len(words) * 100))
        distractors.append(find_n_closest_words(word))
    bar.progress(100)
    distractors_db = pd.DataFrame({"word": words, "distractors": distractors})
    st.write(distractors_db)
    save_df(distractors_db, "distractors.csv")
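# `find_n_closest_words` is not defined in this snippet. A minimal sketch of
# one plausible implementation, assuming a pre-trained gensim KeyedVectors
# file (the path and the `n` default are assumptions, not the original code):
from gensim.models import KeyedVectors

_kv = KeyedVectors.load("word_vectors.kv")  # hypothetical artifact

def find_n_closest_words(word, n=5):
    """Return the n nearest neighbours of `word` in embedding space."""
    if word not in _kv:
        return []
    return [w for w, _score in _kv.most_similar(word, topn=n)]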
def extract(df_ori, des):
    print(df_ori.shape)
    # df_hist and df_sid are prepared at module level.
    df_ori = df_ori.merge(df_hist, on=['session_id'], how='left')
    df_ori = df_ori.merge(df_sid, on=['session_id'], how='left')
    print(df_ori.head(10))
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df, on=['session_id', 'impressions'], how='left')
    # Time gap relative to the merged last-click timestamp.
    df_ori['cur_ts_sub_last'] = df_ori['timestamp'] - df_ori['timestamp_x']
    df_ori.drop(['timestamp', 'timestamp_x'], axis=1, inplace=True)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df, on=['session_id'], how='left')
    # Time gap relative to the merged last-action timestamp.
    df_ori['last_act_gap'] = df_ori['timestamp'] - df_ori['timestamp_x']
    df_ori.drop(['timestamp', 'timestamp_x'], axis=1, inplace=True)
    print(df_ori.head(10))
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def gen_tr_click(df):
    # Keep each session's last row, then drop rows with no reference (click).
    df = df[['session_id', 'reference']].drop_duplicates(
        subset='session_id', keep='last').reset_index(drop=True)
    print(df.shape)
    df = df[pd.notnull(df.reference)].reset_index(drop=True)
    print(df.shape)
    utils.save_df(df, config.data + 'm3_tr_click.ftr')
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    for f in feats:
        tmp = utils.load_df(config.feat + 'm3_' + f)
        print(f)
        # Feature frames are row-aligned with df_ori, so concat by position.
        df_ori = pd.concat(
            [df_ori, tmp.drop(['session_id', 'impressions'], axis=1)], axis=1)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def snapshot(self):
    """Save a snapshot of the cluster."""
    if self.add_t_snap:
        utils.save_df(self.cluster.withColumn("t", lit(float(self.t))),
                      f"t{self.t}", **self.save_params)
    else:
        utils.save_df(self.cluster, f"t{self.t}", **self.save_params)
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_uid, on=['user_id'], how='left')
    df_ori = df_ori.merge(df_session, on=['session_id'], how='left')
    df_ori.drop('user_id', axis=1, inplace=True)
    print(df_ori.head())
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_meta, on=['impressions'], how='left')
    df_ori = df_ori.merge(df_feat, on=['impressions'], how='left')
    # Price and rank of the item relative to its median across sessions.
    df_ori['item_price_div_median'] = df_ori['prices'] / df_ori['impressions_by_prices_median']
    df_ori['item_rank_sub_median'] = df_ori['impr_rank'] - df_ori['impressions_by_impr_rank_median']
    df_ori.drop(['impr_rank', 'prices'], axis=1, inplace=True)
    print(df_ori.head())
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    df_feat = utils.load_df(config.feat + feat)
    df_ori = df_ori.merge(df_feat, on=['session_id', 'impressions'], how='left')
    print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def get_path(path, children):
    files = read_files(path)
    if path:
        children.append(html.P(f"files found : {len(files)}"))
        # Embed every file, compute pairwise similarities, and persist the
        # sorted index pairs plus the file table for the display callback.
        embedding = Embedding()
        embs = np.array(embedding.embeddings(files))
        matrix = similarity_matrix(embs, embs)
        index_pair = sort_matrix(matrix)
        np.save('index_pair.npy', index_pair)
        df = new_df(files)
        save_df(df, './files.csv')
        return children
    return []
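# `similarity_matrix` and `sort_matrix` are not shown here. A minimal sketch
# of what they plausibly do, using plain numpy (an assumption, not the
# project's actual implementation):
import numpy as np

def similarity_matrix(a, b):
    """Cosine similarity between every row of `a` and every row of `b`."""
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a_norm @ b_norm.T

def sort_matrix(matrix):
    """Return (row, col) index pairs sorted by descending similarity,
    keeping only the upper triangle so each pair appears once."""
    rows, cols = np.triu_indices_from(matrix, k=1)
    order = np.argsort(matrix[rows, cols])[::-1]
    return np.stack([rows[order], cols[order]], axis=1)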
def convert(ori, des, prefix):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        # Each prediction file contributes one column, renamed from 'target'.
        df_feat = utils.load_df(config.model + prefix + '%s.csv' % feat).rename(
            columns={'target': feat})
        df_ori = df_ori.merge(df_feat[['session_id', 'impressions', feat]],
                              on=['session_id', 'impressions'], how='left')
    print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(ori, des, sample):
    tr = utils.load_df(ori)
    print(tr.shape)
    tr_out = tr[['session_id', 'impressions']]
    dfs = utils.load_df(sample)
    # Position of each impression within its (session, step) list.
    dfs['impr_rank'] = dfs.groupby(['session_id', 'step']).cumcount().values
    print(dfs.head())
    tr_out = cate_encoding.cate_num_stat(dfs, tr_out,
                                         ['session_id', 'impressions'],
                                         'impr_rank', ['min', 'max', 'median'])
    tr_out.columns = tr_out.columns.astype(str)
    print(tr_out.head())
    utils.save_df(tr_out, des)
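# Quick toy illustration of the cumcount-based impression rank used above:
# within each (session_id, step) group the rows are numbered 0, 1, 2, ...
import pandas as pd

toy = pd.DataFrame({'session_id': ['a', 'a', 'a', 'b'],
                    'step': [1, 1, 1, 1],
                    'impressions': [10, 11, 12, 20]})
toy['impr_rank'] = toy.groupby(['session_id', 'step']).cumcount().values
# toy.impr_rank -> [0, 1, 2, 0]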
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.feat + feat)
        df_ori = df_ori.merge(df_feat, on=['session_id', 'impressions'], how='left')
        print(df_ori.shape)
        # Free each feature frame eagerly to keep memory bounded.
        del df_feat
        gc.collect()
    df_ori = utils.reduce_mem(df_ori)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_last, on=['session_id'], how='left')
    # Rank and price of the candidate relative to the last interacted item.
    df_ori['last_item_rank_diff'] = df_ori['impr_rank'] - df_ori['last_item_impr_rank']
    df_ori['last_item_price_div'] = df_ori['prices'] / df_ori['last_item_price']
    df_ori.drop(['last_item_impr_rank', 'last_item_price', 'prices', 'impr_rank'],
                axis=1, inplace=True)
    df_ori.columns = df_ori.columns.astype(str)
    print(df_ori.head())
    utils.save_df(df_ori, des)
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for c in cols:
        df_ori = cate_encoding.cate_num_rank(df_ori, ['session_id'], c,
                                             ascending=True, show_agg=True)
    df_ori = df_ori.reset_index(drop=True)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
    utils.save_df(
        df_ori[['session_id', 'impressions', 'session_id_by_prices_rank',
                'session_id_by_ctr_rank', 'session_id_by_last_ts_sub_max_rank']],
        feat)
def display_image(children, nc1, nc2):
    global SESSION_ID
    changed_id = [p['prop_id'] for p in dash.callback_context.triggered][0]
    print("changed_id", changed_id)
    print("clicks", children)
    print(f"n_clicks: {nc1, nc2}")
    if children != []:
        # Read the precomputed similarity pairs and the file index.
        index_pair = np.load('index_pair.npy')
        files_path = pd.read_csv('./files.csv')
        # Check which button was clicked and update the session id.
        update_session(index_pair, files_path, changed_id)
        # Read the images and their filenames.
        image1, image2, filename1, filename2 = encoded_images(index_pair, files_path)
        # Save the updated files dataframe.
        save_df(files_path)
        return image1, image2, filename1, filename2
    return "", "", "", ""
def get_test_sample(df):
    df['target'] = (df['reference'] == df['impressions']).astype(int)
    # Drop a noisy sample.
    mask = (df.session_id == 'cbe3752713eee') & (df.timestamp == 1541660358)
    df = df[~mask]
    # Keep only each session's last step.
    df_session = df[['session_id', 'step']].drop_duplicates(
        subset='session_id', keep='last').reset_index(drop=True)
    df = df_session.merge(df, on=['session_id', 'step'], how='left').reset_index(drop=True)
    # Rows with a missing reference are the test targets.
    te = df[pd.isnull(df['reference'])].reset_index(drop=True)
    print(te.shape)
    tr = df[pd.notnull(df['reference'])].reset_index(drop=True)
    print(tr.shape)
    tr.drop(['current_filters', 'reference', 'action_type'], axis=1, inplace=True)
    te.drop(['current_filters', 'reference', 'action_type', 'target'],
            axis=1, inplace=True)
    utils.save_df(te, config.data + 'm3_te.ftr')
    return tr, te
def extract(sample, ori, feat):
    nrows = None
    df = pd.read_csv(sample, nrows=nrows,
                     usecols=['session_id', 'step', 'reference', 'impressions'])
    print(df.head())
    df_ori = utils.load_df(ori)
    print(df_ori.head())
    df = df.merge(df_ori[['session_id', 'step']].drop_duplicates(),
                  on='session_id', how='left')
    print(df.head())
    # Keep only actions that happened before the step being predicted.
    df = df[df.step_x < df.step_y]
    # Click counts per (session, item).
    tmp = df.drop_duplicates(subset=['session_id', 'step_x'])
    df_clk = tmp.groupby(['session_id', 'reference'])['step_x'].agg('count').reset_index()
    print(df_clk.head())
    df_clk.rename(columns={'reference': 'impressions',
                           'step_x': 'item_sid_clk_cnt'}, inplace=True)
    # Impression counts per (session, item).
    df_impr = df.groupby(['session_id', 'impressions'])['step_x'].agg('count').reset_index()
    print(df_impr.head())
    df_impr.rename(columns={'step_x': 'item_sid_impr_cnt'}, inplace=True)
    df_out = df_ori[['session_id', 'impressions']]
    df_out = df_out.merge(df_clk, on=['session_id', 'impressions'], how='left')
    df_out = df_out.merge(df_impr, on=['session_id', 'impressions'], how='left')
    print(df_out.head())
    df_out.columns = df_out.columns.astype(str)
    utils.save_df(df_out, feat)
def diag(self):
    """Save diagnostic information about the cluster energy."""
    T, U = cluster.calc_T(self.cluster, self.G), cluster.calc_U(self.cluster, self.G)
    E = T + U
    if not self.E_initial:
        self.E_initial = E
    # Relative energy drift since the first diagnostic.
    dE = (E - self.E_initial) / self.E_initial
    diagInfo = (self.t, E, dE)
    if self.saveDiag:
        # Cast to float so the row matches the Spark schema.
        diagInfo = [float(x) for x in diagInfo]
        df_diag = self.spark.createDataFrame([diagInfo], schema=schemas.diag)
        utils.save_df(df_diag, f"diag_t{self.t}", **self.save_params)
    else:
        print("{: >30} {: >30} {: >30}".format(*("t", "E", "dE")))
        print("{: >30} {: >30} {: >30}".format(*diagInfo))
def dump_feat(ori, des):
    df = utils.load_df(ori)
    df = df[cols + ['session_id', 'impressions']]
    df.columns = re_cols + ['session_id', 'impressions']
    print(df.shape)
    utils.save_df(df, des)
import pandas as pd

import utils
import config
import cate_encoding

tr = utils.load_df(config.data + 'm3_tr.ftr')
te = utils.load_df(config.data + 'm3_te.ftr')
df = pd.concat([tr, te]).reset_index(drop=True)

df['dt'] = pd.to_datetime(df['timestamp'], unit='s')
df['hour'] = df['dt'].dt.hour
for c in ['city', 'device', 'platform']:
    df = cate_encoding.label_encode(df, c)

# Impression rank within each session.
df['impr_rank'] = df.groupby(['session_id']).cumcount().values

# Price statistics by session.
df = cate_encoding.cate_num_stat(df, df, ['session_id'], 'prices',
                                 ['median', 'std', 'count'])
df['price_sub'] = df['prices'] - df['session_id_by_prices_median']
df['price_div'] = df['prices'] / df['session_id_by_prices_median']

df.drop(['dt'], axis=1, inplace=True)
df.columns = df.columns.astype(str)
# Rows without a target come from the test set.
utils.save_df(df[pd.isnull(df['target'])].reset_index(drop=True),
              config.feat + 'm3_te_0.ftr')
utils.save_df(df[pd.notnull(df['target'])].reset_index(drop=True),
              config.feat + 'm3_tr_0.ftr')
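# `cate_encoding.label_encode` is project-local and not shown. A minimal
# sketch of the behavior assumed above (one integer code per distinct
# category), not necessarily the project's actual implementation:
def label_encode(df, col):
    """Replace `col` with integer codes, one per distinct value."""
    df[col] = pd.factorize(df[col])[0]
    return df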
args = parser.parse_args()
"""/arguments"""

G = 1
TOLERANCE = 1e-04

res = []
data = [
    'c_0500.csv', 'c_0700.csv', 'c_0600.csv', 'c_1000.csv', 'c_0900.csv',
    'c_1200.csv', 'c_1100.csv', 'c_1500.csv', 'c_0300.csv', 'c_1800.csv',
    'c_1300.csv', 'c_0800.csv', 'c_1700.csv', 'c_0200.csv', 'c_0100.csv',
    'c_0400.csv', 'c_0000.csv', 'c_1600.csv', 'c_1400.csv'
]

for fname in data:
    df = utils.load_df(os.path.join(args.input, fname),
                       schema=schemas.clust, part="id")
    e = cluster.calc_E(df)
    # Compare each cluster's total energy against the expected -0.25.
    diff = abs(e - (-0.25))
    res.append([fname, e, -0.25, diff])

sc = SparkContext.getOrCreate()
res = sc.parallelize(res).toDF(schema=schemas.E_test_res)
utils.save_df(res, "E_TEST", args.outputDir, fformat="csv")
tr_lis = []
for cv in range(5):
    # Out-of-fold CTR: aggregate clicks from the other four folds only.
    mask = (df.cv == cv)
    val_tr = df.loc[mask][['session_id', 'impressions']].drop_duplicates()
    tra_tr = df.loc[~mask]
    tmp = tra_tr.groupby('impressions')['click'].agg(['sum', 'count']).reset_index()
    val_tr = val_tr.merge(tmp, on='impressions', how='left')
    tr_lis.append(val_tr)

tr_ctr = pd.concat(tr_lis, axis=0).reset_index(drop=True)
tr_ctr['ctr'] = tr_ctr['sum'] / tr_ctr['count']
# te_ctr (per-impression sum/count) is assumed to be built earlier in the script.
te_ctr['ctr'] = te_ctr['sum'] / te_ctr['count']

trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')
tr_out = trs[['session_id', 'impressions']]
te_out = tes[['session_id', 'impressions']]
te_out = te_out.merge(te_ctr.drop(['sum', 'count'], axis=1),
                      on=['impressions'], how='left')
tr_out = tr_out.merge(tr_ctr.drop(['sum', 'count'], axis=1),
                      on=['session_id', 'impressions'], how='left')
tr_out.columns = tr_out.columns.astype(str)
te_out.columns = te_out.columns.astype(str)
utils.save_df(tr_out, config.feat + 'm3_tr_ctr.ftr')
utils.save_df(te_out, config.feat + 'm3_te_ctr.ftr')
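# The loop above is an out-of-fold target encoding: CTR statistics for fold
# `cv` come only from the other folds, so a row never sees its own click.
# A self-contained toy version of the same pattern (illustrative data only):
import pandas as pd

toy = pd.DataFrame({'impressions': [1, 1, 1, 2],
                    'click':       [1, 0, 1, 0],
                    'cv':          [0, 0, 1, 1]})
for cv in range(2):
    fold = toy.cv == cv
    stats = toy[~fold].groupby('impressions')['click'].mean()
    toy.loc[fold, 'oof_ctr'] = toy.loc[fold, 'impressions'].map(stats)
print(toy)  # impression 2 gets NaN in fold 1: unseen in the other fold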
df = pd.concat([tr, te])
trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')
tr_out = trs[['session_id', 'impressions']]
te_out = tes[['session_id', 'impressions']]

for act in actions:
    # Per-item page views (count) and unique visitors (nunique) per action type.
    tmp = df[df.action_type == act][['reference', 'user_id']]
    tmp = tmp.groupby(['reference'])['user_id'].agg(['count', 'nunique']).reset_index()
    tmp.rename(columns={'reference': 'impressions',
                        'count': act + '_pv',
                        'nunique': act + '_uv'}, inplace=True)
    # Keep only numeric references, which are item ids.
    tmp['impressions'] = tmp['impressions'].astype(str)
    tmp = tmp[tmp['impressions'].str.isnumeric()]
    tmp['impressions'] = tmp['impressions'].astype('int')
    print(tmp.head())
    tr_out = tr_out.merge(tmp, on=['impressions'], how='left')
    te_out = te_out.merge(tmp, on=['impressions'], how='left')

tr_out.columns = tr_out.columns.astype(str)
te_out.columns = te_out.columns.astype(str)
utils.save_df(tr_out, config.feat + 'm3_tr_item_act_pv.ftr')
utils.save_df(te_out, config.feat + 'm3_te_item_act_pv.ftr')
def convert(df_ori, des, df_out):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_out, on=['session_id', 'impressions'], how='left')
    df_ori.columns = df_ori.columns.astype(str)
    print(df_ori.head())
    utils.save_df(df_ori, des)
def ETL(extractor,
        components,
        data_dict,
        same_dt_aggregator,
        hdf5_fname=None,
        joined_path=None,
        hadm_ids=ALL,
        use_base_df=True,
        to_pandas=False,
        chunksize=500000):
    logger.log('***ETL***', new_level=True)
    logger.log('SETUP', new_level=True)
    category_map = mimic_category_map(data_dict)
    ureg = units.MedicalUreg()
    transformer = transform_pipeline()
    standard_clean_pipeline = Pipeline([
        ('aggregate_same_datetime', same_dt_aggregator),
        ('split_dtype', transformers.split_dtype()),
        ('standardize_columns', transformers.column_standardizer(data_dict, ureg)),
        ('standardize_categories', transformers.standardize_categories(data_dict, category_map)),
        ('split_bad_categories', transformers.split_bad_categories(data_dict)),
        # ('one_hotter', transformers.nominal_to_onehot()),
        ('drop_oob_values', transformers.oob_value_remover(data_dict))
    ])

    should_save = hdf5_fname is not None
    df_base = None
    if should_save and use_base_df:
        try:
            df_base = utils.open_df(hdf5_fname, joined_path)
        except Exception:
            pass

    if df_base is not None:
        existing_components = df_base.columns.get_level_values(
            column_names.COMPONENT).unique().tolist()
        existing_ids = set(df_base.index.get_level_values(column_names.ID).tolist())
        requested_ids = hadm_ids if hadm_ids != ALL else get_all_hadm_ids()
        new_ids = [ID for ID in requested_ids if ID not in existing_ids]
        # Case 1: new ids in existing columns; don't try to be smart with ALL
        # unless the number of IDs is small.
        if len(new_ids) > 0:
            df_addition = ETL(extractor,
                              existing_components,
                              data_dict,
                              same_dt_aggregator,
                              hadm_ids=new_ids,
                              to_pandas=True)
            if df_addition is not None:
                df_base = pd.concat([df_base, df_addition])
        # Now we only need to load NEW components.
        components = [comp for comp in components if comp not in existing_components]
        logger.log('Base DF to Dask')
        df_base = dd.from_pandas(df_base.reset_index(), chunksize=chunksize)

    df_all = df_base
    logger.log('BEGIN ETL for {} admissions and {} components: {}'.format(
        hadm_ids if hadm_ids == ALL else len(hadm_ids), len(components), components),
        new_level=True, end_level=True)

    for component in components:
        logger.log('{}: {}/{}'.format(component.upper(),
                                      components.index(component) + 1,
                                      len(components)),
                   new_level=True)

        # ----EXTRACT----
        logger.log("Extracting...", new_level=True)
        df_extracted = extractor.extract_component(component, hadm_ids)
        if df_extracted.empty:
            print('EMPTY Dataframe EXTRACTED for {}, n={} ids'.format(component, len(hadm_ids)))
            logger.end_log_level()
            continue
        if should_save:
            logger.log('Save EXTRACTED DF = {}'.format(df_extracted.shape))
            utils.save_df(df_extracted, hdf5_fname, 'extracted/{}'.format(component))
        logger.end_log_level()

        # ----TRANSFORM----
        logger.log("Transforming... {}".format(df_extracted.shape), new_level=True)
        transformer.set_params(add_level__level_val=component)
        df_transformed = transformer.transform(df_extracted)
        print('Data Loss (Extract > Transformed):',
              utils.data_loss(df_extracted.set_index(column_names.ID).value.to_frame(),
                              df_transformed))
        if df_transformed.empty:
            print('EMPTY Dataframe TRANSFORMED for {}, n={} ids'.format(component, len(hadm_ids)))
            logger.end_log_level()
            continue
        if should_save:
            logger.log('Save TRANSFORMED DF = {}'.format(df_transformed.shape))
            utils.save_df(df_transformed, hdf5_fname, 'transformed/{}'.format(component))
        logger.end_log_level()

        # -----CLEAN-----
        logger.log("Cleaning... {}".format(df_transformed.shape), new_level=True)
        df = standard_clean_pipeline.transform(df_transformed)
        print('Data Loss (Extract > Cleaned):',
              utils.data_loss(df_extracted.set_index(column_names.ID).value.to_frame(), df))
        if df.empty:
            print('EMPTY Dataframe CLEANED for {}, n={} ids'.format(component, len(hadm_ids)))
            logger.end_log_level()
            continue
        if should_save:
            logger.log('Save CLEANED DF = {}'.format(df.shape))
            utils.save_df(df, hdf5_fname, 'cleaned/{}'.format(component))
        logger.end_log_level()

        del df_extracted, df_transformed

        logger.log('Filter & sort - {}'.format(df.shape))
        df.sort_index(inplace=True)
        df.sort_index(inplace=True, axis=1)

        logger.log('Convert to dask - {}'.format(df.shape))
        df_dask = dd.from_pandas(df.reset_index(), chunksize=chunksize)
        del df

        logger.log('Join to big DF')
        if df_all is None:
            df_all = df_dask
        else:
            df_all = df_all.merge(df_dask, how='outer', on=['id', 'datetime'])
        del df_dask
        logger.end_log_level()

    logger.end_log_level()

    if df_all is None or not to_pandas:
        logger.end_log_level()
        return df_all

    logger.log('Dask DF back to pandas')
    df_pd = df_all.compute()
    del df_all
    df_pd.set_index(['id', 'datetime'], inplace=True)

    logger.log('SORT Joined DF')
    df_pd.sort_index(inplace=True)
    df_pd.sort_index(inplace=True, axis=1)

    if should_save:
        logger.log('SAVE Big DF')
        utils.save_df(df_pd, hdf5_fname, joined_path)

    logger.end_log_level()
    return df_pd
if __name__ == '__main__':
    nrow = None
    train = utils.load_df(config.data + 'sample_train.csv', nrows=nrow)
    test = utils.load_df(config.data + 'sample_test.csv', nrows=nrow)
    df = pd.concat([train, test]).reset_index(drop=True)
    tr1 = gen_train_sample(train)
    tr2, te = get_test_sample(test)
    # Save the combined training sample.
    tr = pd.concat([tr1, tr2]).reset_index(drop=True)
    utils.save_df(tr, config.data + 'm3_tr.ftr')
    gen_tr_click(df)