def append_to_file(self):
    """Append freshly loaded incident/trade rows to the historical CSV stores.

    Reads the historical CSVs from the bigdata database directory,
    concatenates the new rows loaded from Postgres, de-duplicates on
    ``title`` (keeping the first occurrence) and writes the merged frames
    back. Returns nothing; the side effect is rewriting ``allfiles.csv``
    and ``alltrades.csv``.
    """
    home = os.getenv("HOME")
    # Prefer the per-user bigdata directory when it exists, otherwise fall
    # back to the system-wide /bigdata/database.
    start_path = (home + '/bigdata/database'
                  if os.path.isdir(home + '/bigdata') else '/bigdata/database')
    os.chdir(start_path)
    df_historical_inc = pd.read_csv("allfiles.csv", sep=',')
    df_historical_trade = pd.read_csv("alltrades.csv", sep=',')
    df_new_trade = self._load_postgres_data_for_trades()
    df_new_inc = self._cleanser_for_inc()
    # BUG FIX: pandas has no ``pd.concate``; ``pd.concat`` takes a *list*
    # of frames, not two positional frames.
    df_updated_trade = pd.concat([df_historical_trade, df_new_trade])
    df_updated_inc = pd.concat([df_historical_inc, df_new_inc])
    df_updated_trade.drop_duplicates(subset="title", keep='first', inplace=True)
    df_updated_inc.drop_duplicates(subset="title", keep='first', inplace=True)
    # BUG FIX: the original wrote to ``start_path + 'allfiles.csv'`` with no
    # path separator, producing '.../databaseallfiles.csv'.
    df_updated_inc.to_csv(os.path.join(start_path, 'allfiles.csv'),
                          sep=',', header=True, index=False)
    df_updated_trade.to_csv(os.path.join(start_path, 'alltrades.csv'),
                            sep=',', header=True, index=False)
def main():
    """Collect per-cpc shifted-attack CSVs into one combined CSV.

    Reads ``cpc {n}-shifted_XORED_csv.csv`` for n in 2..10 from the
    ``..\\results\\old`` directory, keeps only the two address columns, and
    writes the union to ``all_shifted_attacks.csv``.
    """
    pd.options.display.max_rows = 999999
    # Seed the frame with cpc 2; the remaining cpc files are appended below.
    shift_attacks = pd.read_csv(
        f'..\\results\\old\\cpc {2}-shifted_XORED_csv.csv')[[
            'original_address', 'address'
        ]]
    for cpc in range(3, 11):
        try:
            # BUG FIX: ``pd.concate`` does not exist -> ``pd.concat``.
            shift_attacks = pd.concat([
                shift_attacks,
                pd.read_csv(
                    f'..\\results\\old\\cpc {cpc}-shifted_XORED_csv.csv')[[
                        'original_address', 'address'
                    ]]
            ], ignore_index=True)
        except FileNotFoundError:
            # Narrowed from a bare ``except:`` — a missing per-cpc file is
            # the only expected, skippable condition here.
            print(f'cpc {cpc} , does not have files..')
    shift_attacks.to_csv(f'..\\results\\old\\all_shifted_attacks.csv')
def get_spectra_from_file(file):
    """Load MS2 spectra from an mzML or HDF5 file into a single DataFrame.

    Returns the combined positive/negative-mode MSMS DataFrame, or ``None``
    (after touching an empty output file) when the file has no MSMS data.

    Raises:
        ValueError: if the file extension is not mzML/HDF5.
    """
    ext = file.split('.')[-1].lower()
    if ext == 'mzml':
        # Returns a dict of dataframes from an mzml file.
        raw_data = mzml_to_df(file)
    elif ext in ('h5', 'hdf5', 'hdf'):
        # This is used when input is an hdf5 file.
        raw_data = mgd.df_container_from_metatlas_file(file)
    else:
        # BUG FIX: the original fell through with ``raw_data`` undefined and
        # crashed with a NameError on any other extension.
        raise ValueError(f'unsupported file type: {file}')
    spectra = None
    if isinstance(raw_data['ms2_pos'], pd.DataFrame) and isinstance(
            raw_data['ms2_neg'], pd.DataFrame):
        # It has both pos and neg spectra — combine the two polarities.
        # BUG FIX: ``pd.concate`` does not exist -> ``pd.concat``.
        spectra = pd.concat([
            create_msms_dataframe(raw_data['ms2_pos']),
            create_msms_dataframe(raw_data['ms2_neg'])
        ])
    elif isinstance(raw_data['ms2_pos'], pd.DataFrame):
        spectra = create_msms_dataframe(raw_data['ms2_pos'])
    elif isinstance(raw_data['ms2_neg'], pd.DataFrame):
        spectra = create_msms_dataframe(raw_data['ms2_neg'])
    else:
        print('File has no MSMS data.')  # , file=sys.stderr)
        open(make_output_filename(file), 'a').close()  # make empty file
    return spectra
def main():
    """Compute auPRC for each labels/predictions HDF5 pair and write a report.

    Expects matched lists of label and prediction HDF5 files in the parsed
    args; writes tab-separated ``Dataset<TAB>auPRC`` rows to
    ``<args.outf>/perf.metrics.txt``.
    """
    args = parse_args()
    # Make sure the label hdf5 inputs are matched with prediction hdf5 inputs.
    assert len(args.labels_hdf5) == len(args.predictions_hdf5)
    num_datasets = len(args.labels_hdf5)
    sample_to_auprc = dict()
    for i in range(num_datasets):
        print(args.labels_hdf5[i])
        print(args.predictions_hdf5[i])
        cur_preds = pd.read_hdf(args.predictions_hdf5[i])
        cur_labels = pd.read_hdf(args.labels_hdf5[i])
        num_tasks = cur_preds.shape[1]
        if (num_tasks > 1) and (args.multitask == True):
            # Score all the tasks.
            for cur_task in range(num_tasks):
                task_labels = cur_labels[cur_task]
                task_preds = cur_preds[cur_task]
                # BUG FIX: ``pd.concate`` does not exist -> ``pd.concat``.
                cur_subset = pd.concat([task_labels, task_preds],
                                       axis=1).dropna()
                cur_subset.columns = ['labels', 'preds']
                task_name = colname_to_task_name[cur_task]
                cur_sample = _strip_labels_suffix(
                    args.labels_hdf5[i]) + '.' + task_name
                cur_auprc = average_precision_score(cur_subset['labels'],
                                                    cur_subset['preds'])
                sample_to_auprc[cur_sample] = cur_auprc
        elif num_tasks > 1:
            # Get the actual task column.
            for key in task_name_to_colname:
                if key in args.labels_hdf5[i]:
                    # Extract the corresponding column.
                    cur_task_colname = task_name_to_colname[key]
                    cur_labels = cur_labels[cur_task_colname]
                    # Assert the labels and predictions dataframes are matched.
                    assert key in args.predictions_hdf5[i]
                    cur_preds = cur_preds[cur_task_colname]
                    cur_data = pd.concat((cur_preds, cur_labels), axis=1)
                    cur_data = cur_data.dropna()
                    cur_data.columns = ['preds', 'labels']
                    cur_auprc = average_precision_score(cur_data['labels'],
                                                        cur_data['preds'])
                    cur_sample = _strip_labels_suffix(args.labels_hdf5[i])
                    sample_to_auprc[cur_sample] = cur_auprc
        else:
            cur_data = pd.concat((cur_preds, cur_labels), axis=1).dropna()
            cur_data.columns = ['preds', 'labels']
            cur_auprc = average_precision_score(cur_data['labels'],
                                                cur_data['preds'])
            cur_sample = _strip_labels_suffix(args.labels_hdf5[i])
            sample_to_auprc[cur_sample] = cur_auprc
    print(sample_to_auprc)
    # ``with`` guarantees the report file is closed even on error.
    with open(args.outf + "/perf.metrics.txt", 'w') as outf:
        outf.write('Dataset\tauPRC\n')
        for key in sample_to_auprc:
            outf.write(key + '\t' + str(sample_to_auprc[key]) + '\n')


def _strip_labels_suffix(name):
    """Remove a trailing '.labels.0' from *name*.

    BUG FIX: the original used ``name.strip('.labels.0')``, which strips any
    of the *characters* '.', 'l', 'a', 'b', 'e', 's', '0' from both ends of
    the string — not the suffix.
    """
    suffix = '.labels.0'
    return name[:-len(suffix)] if name.endswith(suffix) else name
def add_new_country_schedule(filename):
    """Merge a new country's holiday schedule into the school-holidays CSV.

    Args:
        filename: Path of the new country's holiday file, passed through to
            ``reindex_holidays``.

    Returns:
        None. Side effect: rewrites ``data/school_holidays.csv``.
    """
    df_holidays = reindex_holidays(filename)
    df_holidays = add_region_id(df_holidays)
    df_current_school_holidays = pd.read_csv(
        get_file_path('data/school_holidays.csv', fileDir),
        parse_dates=['date'])
    # BUG FIX: ``pd.concate`` does not exist -> ``pd.concat``.
    df_current_school_holidays = pd.concat(
        [df_holidays, df_current_school_holidays], axis=0)
    # NOTE(review): the read path is resolved via get_file_path(..., fileDir)
    # but the write path is relative to the CWD — confirm both point to the
    # same file. Also confirm whether the index column should be written
    # (no index=False here).
    df_current_school_holidays.to_csv('data/school_holidays.csv')
float) / df_percent['inst_cnt_installed'] # inst_cate_percent df_installed['count'] = np.ones(df_installed.shape[0]) df_group_exist = df_installed.groupby( ['userID', 'appID']).count().rename(columns={ 'count': 'inst_is_installed' }).reset_index() df_group_app = df_installed.groupby('appID').count().rename( columns={ 'userID': 'inst_app_installed' }).reset_index() df_train = pd.read_csv('df_basic_train.csv') df_test = pd.read_csv('df_basic_test.csv') df_result = pd.concate(df_train, df_test) df_result = pd.merge(df_result, df_percent, how='left', on=['userID', 'appCategory']) df_result['inst_cate_percent'].fillna(0.0, inplace=True) # 同类应用比例 df_result['inst_cnt_installed'].fillna(0, inplace=True) df_result = pd.merge(df_result, df_group_exist, how='left', on=['userID', 'appID']) df_result['inst_is_installed'].fillna(0, inplace=True) del df_installed['count']
def main(args: Namespace):
    """Run the de-biased recommendation experiment suite.

    Loads the simulated train/val/test splits plus pre-trained relevance and
    exposure factor models, then trains and unbiased-evaluates the popularity
    model, SVD, each requested base/adjusted model, and optionally the AC-GAN
    models.
    """
    # NOTE(review): '_smaple' looks like a typo for '_sample' but is the
    # on-disk file name — left untouched so existing data still loads.
    ratings = pd.read_feather(
        os.path.join(args.data_path, args.data_name + '_smaple'))
    user_num, item_num = ratings.uidx.max() + 1, ratings.iidx.max() + 1
    tr_df = pd.read_feather(
        os.path.join(args.sim_path, f'{args.prefix}_sim_train.feather'))
    val_df = pd.read_feather(
        os.path.join(args.sim_path, f'{args.prefix}_sim_val.feather'))
    te_df = pd.read_feather(
        os.path.join(args.sim_path, f'{args.prefix}_sim_test.feather'))
    if args.tune_mode:
        # Tune mode: train on train+val, evaluate on the real test split.
        # BUG FIX: ``pd.concate`` does not exist -> ``pd.concat``.
        tr_df = pd.concat([tr_df, val_df])
    else:
        # Otherwise validate on the val split.
        te_df = val_df
    # Per-user set of already-seen items, used to mask training negatives.
    past_hist = tr_df.groupby('uidx').apply(lambda x: set(x.iidx)).to_dict()
    item_cnt_dict = tr_df.groupby('iidx').count().uidx.to_dict()
    item_cnt = np.array(
        [item_cnt_dict.get(iidx, 0) for iidx in range(item_num)])
    logger.info(f'test data size: {te_df.shape}')
    dim = args.dim

    # Pre-trained relevance factor model.
    rel_factor = FactorModel(user_num, item_num, dim)
    PATH = os.path.join(args.sim_path, f'{args.prefix}_rel.pt')
    rel_factor.load_state_dict(torch.load(PATH))
    rel_factor.eval()

    # "Oracle" exposure model with its trained noise head.
    train_expo_factor = FactorModel(user_num, item_num, dim)
    PATH = os.path.join(args.sim_path, f'{args.prefix}_expo.pt')
    train_expo_factor.load_state_dict(torch.load(PATH))
    train_expo_factor.eval()
    train_expo_factor = NoiseFactor(train_expo_factor, args.dim)
    train_expo_factor = train_expo_factor.to(
        torch.device(f'cuda:{args.cuda_idx}'))
    train_expo_factor.load_state_dict(
        torch.load(os.path.join(args.sim_path,
                                f'{args.prefix}_expo_noise.pt')))
    train_expo_factor.eval()

    # Bootstrapped exposure model used inside the unbiased evaluator.
    expo_factor = FactorModel(user_num, item_num, dim)
    PATH = os.path.join(args.sim_path, f'{args.prefix}_expo_bs.pt')
    expo_factor.load_state_dict(torch.load(PATH))
    expo_factor.eval()

    rating_model = RatingEstimator(user_num, item_num, rel_factor)
    expo_model = ClassRecommender(user_num, item_num, expo_factor)
    tr_mat = frame2mat(tr_df, user_num, item_num)
    val_mat = frame2mat(val_df, user_num, item_num)
    choices = args.models
    logging.info(f'Running {choices}')

    def get_model(model_str, user_num, item_num, factor_num):
        """Build a fresh factor model of the requested architecture."""
        if model_str == 'mlp':
            return MLPRecModel(user_num, item_num, factor_num)
        elif model_str == 'gmf':
            return FactorModel(user_num, item_num, factor_num)
        elif model_str == 'ncf':
            return NCFModel(user_num, item_num, factor_num)
        else:
            raise NotImplementedError(f'{model_str} is not implemented')

    logging.info('-------The Popularity model-------')
    pop_factor = PopularModel(item_cnt)
    pop_model = PopRecommender(pop_factor)
    logger.info('unbiased eval for plian popular model on test')
    unbiased_eval(user_num, item_num, te_df, pop_model,
                  epsilon=args.epsilon, rel_model=rating_model,
                  past_hist=past_hist, expo_model=expo_model,
                  expo_compound=args.p)

    logger.info('-------The SVD model---------')
    sv = SVDRecommender(tr_mat.shape[0], tr_mat.shape[1], dim)
    logger.info(f'model with dimension {dim}')
    sv.fit(tr_mat)
    logger.info('un-biased eval for SVD model on test')
    unbiased_eval(user_num, item_num, te_df, sv,
                  epsilon=args.epsilon, rel_model=rating_model,
                  past_hist=past_hist, expo_model=expo_model,
                  expo_compound=args.p)

    def complete_experiment(model_str, user_num, item_num, dim):
        """Train and evaluate the base, pop-adjusted, mirror-adjusted and
        oracle-adjusted variants of one model architecture."""
        logging.info(f'-------The {model_str} model-------')
        base_factor = get_model(model_str, user_num=user_num,
                                item_num=item_num, factor_num=dim)
        base_model = ClassRecommender(user_num, item_num, base_factor)
        base_model.fit(tr_df, num_epochs=args.epoch, cuda=args.cuda_idx,
                       decay=1e-8, num_neg=args.num_neg,
                       past_hist=past_hist, lr=args.lr)
        logger.info(f'unbiased eval for {model_str} model on test')
        unbiased_eval(user_num, item_num, te_df, base_model,
                      epsilon=args.epsilon, rel_model=rating_model,
                      past_hist=past_hist, expo_model=expo_model,
                      expo_compound=args.p)

        logging.info(f'-------The {model_str} Pop Adjust model-------')
        pop_adjust_factor = get_model(model_str, user_num=user_num,
                                      item_num=item_num, factor_num=dim)
        pop_adjust_model = ClassRecommender(user_num, item_num,
                                            pop_adjust_factor, pop_factor,
                                            expo_thresh=0.1)
        pop_adjust_model.fit(tr_df, num_epochs=args.epoch,
                             cuda=args.cuda_idx, decay=args.decay,
                             num_neg=args.num_neg, past_hist=past_hist,
                             lr=args.lr)
        logger.info(
            f'unbiased eval for adjust {model_str} with popular model on test')
        unbiased_eval(user_num, item_num, te_df, pop_adjust_model,
                      epsilon=args.epsilon, rel_model=rating_model,
                      past_hist=past_hist, expo_model=expo_model,
                      expo_compound=args.p)
        del pop_adjust_factor

        logging.info(f'-------The {model_str} Mirror Adjust model-------')
        adjust_factor = get_model(model_str, user_num=user_num,
                                  item_num=item_num, factor_num=dim)
        adjust_model = ClassRecommender(user_num, item_num, adjust_factor,
                                        base_factor, expo_thresh=0.1)
        adjust_model.fit(tr_df, num_epochs=args.epoch, cuda=args.cuda_idx,
                         num_neg=args.num_neg, past_hist=past_hist,
                         decay=args.decay, lr=args.lr)
        logger.info(f'un-biased eval for {model_str} mirror adjusted model')
        unbiased_eval(user_num, item_num, te_df, adjust_model,
                      epsilon=args.epsilon, rel_model=rating_model,
                      past_hist=past_hist, expo_model=expo_model,
                      expo_compound=args.p)
        del adjust_factor

        logger.info(f'-------The {model_str} Oracle Adjust model---------')
        oracle_factor = get_model(model_str, user_num=user_num,
                                  item_num=item_num, factor_num=dim)
        oracle_model = ClassRecommender(user_num, item_num, oracle_factor,
                                        train_expo_factor, expo_thresh=0.1,
                                        expo_compound=args.p)
        oracle_model.fit(tr_df, num_epochs=args.epoch, cuda=args.cuda_idx,
                         num_neg=args.num_neg, past_hist=past_hist,
                         decay=args.decay, lr=args.lr)
        logger.info('un-biased eval for oracle model on test')
        unbiased_eval(user_num, item_num, te_df, oracle_model,
                      epsilon=args.epsilon, rel_model=rating_model,
                      past_hist=past_hist, expo_model=expo_model,
                      expo_compound=args.p)
        del oracle_factor

    for model_str in choices:
        if model_str != 'acgan':
            complete_experiment(model_str, user_num, item_num, dim)

    if 'acgan' in choices:
        logger.info('-------The AC GAN model---------')
        f = get_model(args.f_model, user_num, item_num, dim)
        g = get_model(args.g_model, user_num, item_num, dim)
        beta = BetaModel(user_num=user_num, item_num=item_num)
        f_recommender = ClassRecommender(user_num, item_num, f)
        g_recommender = ClassRecommender(user_num, item_num, g)
        # Warm-start g before adversarial training.
        g_recommender.fit(tr_df, num_epochs=args.g_round_head,
                          cuda=args.cuda_idx, num_neg=args.num_neg,
                          past_hist=past_hist, decay=args.decay, lr=args.lr)
        ac_train_v3(f, False, g, False, beta, tr_df,
                    user_num=user_num, item_num=item_num,
                    num_neg=args.num_neg, past_hist=past_hist,
                    val_df=te_df, rating_model=rating_model,
                    expo_model=expo_model, num_epochs=args.epoch,
                    decay=args.decay, cuda_idx=args.cuda_idx, lr=args.lr,
                    g_weight=0.5, expo_compound=args.p,
                    epsilon=args.epsilon)
        logger.info(f'eval on test with f_model ({args.f_model})')
        unbiased_eval(user_num, item_num, te_df, f_recommender,
                      epsilon=args.epsilon, rel_model=rating_model,
                      past_hist=past_hist, expo_model=expo_model,
                      expo_compound=args.p)
        logger.info(f'eval on test with g_model ({args.g_model})')
        unbiased_eval(user_num, item_num, te_df, g_recommender,
                      epsilon=args.epsilon, rel_model=rating_model,
                      past_hist=past_hist, expo_model=expo_model,
                      expo_compound=args.p)
tea_counts = tea_counts.rename(columns = {'id' : 'counts'}) high_earners = df.groupby('category').wage.apply(lambda x: np.percentile(x, 75)).reset_index() df.groupby(['location', 'Day of week'])['Total cales'].mean().reset_index() #pivot table df.pivot(columns = 'column pivot', inex = 'column to be row', values = 'column to be values') #merge df new_df = pd.merge(df1, df2) new_df = df.merge(df1).merge(df3) #concate menu = pd.concate([df1], [df2]) #merge left / right how = left #only left df item will be rept #merge inner / outter df_new = pd.merge(df1, df2, how = 'outer') #merge all lines without losing data 'nan' and 'None' will be filled #merge and change column name pd.merge( orders, customers, left_on = 'customer_id', right_on = 'id',
# Fold-level Keras predictions: accumulate out-of-fold and test averages.
keras_fold_val = model.predict(X_val_list).ravel()
keras_oof[valid_idx] += keras_fold_val / folds.n_splits
keras_fold_auc = roc_auc_score(valid_y, keras_fold_val)
keras_preds += model.predict(X_test_list).ravel() / folds.n_splits

####################### CatBoost ########################################
# Use an intermediate Keras layer's activations as learned features.
layer_name = 'batch_normalization_2'
intermediate_layer_model = Model(
    inputs=model.input, outputs=model.get_layer(layer_name).output)
X_train_k = pd.DataFrame(intermediate_layer_model.predict(X_train_list))
X_val_k = pd.DataFrame(intermediate_layer_model.predict(X_val_list))
X_test_k = pd.DataFrame(intermediate_layer_model.predict(X_test_list))
# BUG FIX: ``pd.concate`` does not exist -> ``pd.concat``.
tempdata = pd.concat((X_train_k, X_val_k), axis=0, ignore_index=True)
encode_features += tempdata.values / folds.n_splits
# CatBoost training left disabled, as in the original:
# cls.fit(X_train_k, train_y, eval_set=(X_val_k, valid_y), early_stopping_rounds=50, verbose=100, plot=False)
# cls_fold_val = cls.predict_proba(X_val_k)[:, 1]
# cls_oof[valid_idx] += cls_fold_val / folds.n_splits
# cls_fold_AUC = roc_auc_score(valid_y, cls_fold_val)
# cls_preds += cls.predict_proba(X_test_k)[:, 1] / folds.n_splits

########################### Fold Results #########################
print("\n")
print('-' * 30)
print('Fold {} - Keras_OOF = {}'.format(fold + 1, keras_fold_auc))
# print('Fold {} - CatBoost_OOF = {}'.format(fold + 1, cls_fold_AUC))