def main():
    """
    Makes top 368 clients to all guards usability table.

    Reads the IP-to-AS mapping and the pickled guard bandwidth table, builds a
    guard-fingerprint -> AS mapping, loads and verifies the AS-path index, and
    then computes one guard usability table per client AS listed in
    ``../data/top368client.txt``. The result (client AS -> usability table) is
    written to ``client_to_guard_usability.json`` in the working directory.

    All files are opened through context managers so that handles are released
    and the output file is flushed and closed before the function returns.
    """
    paths_filename = "as_paths.txt"
    index_filename = "as_paths_index.bin"
    libspookyhash_filename = "./libspookyhash.so"

    with open("../guard_info/ip_to_as.json") as ip_file:
        ip_to_as = json.load(ip_file)

    # NOTE: pickle executes arbitrary code on load; this file must come from a
    # trusted source.
    with open("../guard_info/guard_to_bw.pickle", "rb") as guard_file:
        guard_to_bw = pickle.load(guard_file)

    # Only the guard objects (the keys) are needed here, not their bandwidth.
    fp_to_as = {g.fingerprint: ip_to_as[g.address] for g in guard_to_bw}

    pfi_instance = pfi.PFI(libspookyhash_filename, paths_filename,
                           index_filename)
    pfi_instance.load()
    pfi_instance.verify()

    # Skip blank lines (e.g. a trailing newline) so that no empty AS is queried.
    with open("../data/top368client.txt", 'r') as client_file:
        client_as_lst = [line.strip() for line in client_file if line.strip()]

    client_to_guard_usability = {}
    for client_as in client_as_lst:
        print(f'{client_as}')
        client_to_guard_usability[client_as] = (
            denasa.make_guard_usability_dict(client_as, fp_to_as,
                                             pfi_instance))

    with open("client_to_guard_usability.json", "w") as out_file:
        json.dump(client_to_guard_usability, out_file)
import numpy as np import argparse import pfi import json import pickle import copy import relays import matplotlib.pyplot as plt # initialize vars paths_filename = "as_paths.txt" index_filename = "as_paths_index.bin" libspookyhash_filename = "../denasa/libspookyhash.so" pfi_instance = pfi.PFI(libspookyhash_filename, paths_filename, index_filename) pfi_instance.load() pfi_instance.verify() ip_to_as = json.load(open("../guard_info/ip_to_as.json")) all_ases = [ asn.strip() for asn in open("../data/relay_ases.txt", 'r').readlines() ] guard_to_bw = pickle.load(open("../guard_info/guard_to_bw.pickle", "rb")) fp_to_bw = {g.fingerprint: bw for (g, bw) in guard_to_bw.items()} fp_to_as = { g.fingerprint: ip_to_as[g.address] for (g, bw) in guard_to_bw.items()
def run(args):
    """Train an RF and an NN classifier on the TC data and export importances.

    Reads ``n_shuffles``, ``corr_th``, ``epoch``, ``batch``, ``max_cols`` and
    ``bootstrap_cols`` from ``args``. Column 0 of the train/validation tables
    is used as the target and the remaining columns as features. For each
    model, scores are logged and feature-importance plots and dumps are
    written below a results directory derived from ``APP`` and ``corr_th``.

    NOTE(review): as written, the function cannot get past the LINCS
    subsetting step: ``train`` and ``test`` are read before they are ever
    assigned, which raises at runtime. See the note at that step.
    """
    # TODO: log out the args
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_ff_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(
        filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ========== Load data ==========
    print('\nLoad TC data ...')

    # ---------- Load data ----------
    # y_enc = pd.read_csv(YENC_PATH, sep='\t')
    data_train = pd.read_csv(DATAPATH_TR, sep=',')
    data_val = pd.read_csv(DATAPATH_VL, sep=',')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    # NOTE(review): `train` and `test` are never defined in this function, so
    # the two subsetting statements below raise before any model is trained.
    # Presumably they were meant to operate on data_train / data_val, but the
    # selected columns ('case_id', 'cancer_type') do not obviously match the
    # "column 0 is the label" layout used further down -- confirm the intended
    # behaviour before fixing. The gene list is also read from a hard-coded
    # absolute path.
    mm = pd.read_csv('/vol/ml/apartin/Benchmarks/Data/Pilot1/lincs1000.tsv',
                     sep='\t')
    train = train[['case_id', 'cancer_type'] +
                  mm['gdc'].tolist()]  # Extract lincs from the whole dataset
    test = test[['case_id', 'cancer_type'] +
                mm['gdc'].tolist()]  # Extract lincs from the whole dataset
    print(train.shape)
    print(test.shape)

    if args.bootstrap_cols > -1:
        # Keep the label column and a random subset of the feature columns;
        # the validation table is reduced to the same columns.
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols,
                                              axis=1,
                                              random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
        print(f'\ndata_train.shape {data_train.shape}')
        print(f'data_val.shape {data_val.shape}')

    # ========== RF classifier ==========
    logger.info('RF classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    # Column 0 is the target, all remaining columns are features.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns

    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info(f'Train RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(
        f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ... (needs y_enc, whose loading is disabled above)
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # ---------- Feature importance ----------
    # Plot the forest's built-in feature importance and save plot and table
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI on the validation split
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ========== NN classifier ==========
    logger.info(' ')
    logger.info('NN classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    # Re-extract the splits: the targets are one-hot encoded below.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns

    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # The number of classes is taken from the training labels only.
    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('Training NN Classifier...')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)
    history = keras_model.fit(xtr,
                              ytr,
                              epochs=epoch,
                              batch_size=batch,
                              verbose=0)
    # utils.plot_keras_learning(history, figsize = (10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))
    # NOTE(review): [-1] selects the LAST value returned by evaluate(); whether
    # that is the loss depends on how create_nn_classifier compiles the model
    # -- confirm that the "val loss" wording below is accurate.
    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    # Class indices are recovered from the one-hot / score matrices via argmax.
    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='macro')))

    # ---------- Feature importance ----------
    # PFI (here ydata is the one-hot encoded validation target)
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='NN Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='NN Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
def run(args):
    """Fit RF and NN classifiers on the TC data and export their importances.

    ``args`` supplies ``n_shuffles``, ``corr_th``, ``epoch``, ``batch``,
    ``max_cols_plot`` and ``bootstrap_cols``. In both input tables column 0 is
    the target and the remaining columns are the features. For each model the
    scores are logged, a confusion matrix is plotted, and the PFI plots and
    dump are written into ``results_{APP}_cor{corr_th}`` below ``file_path``.
    """
    # TODO: log out the args
    print(f'\n{args}')
    n_shuffles, corr_th = args.n_shuffles, args.corr_th
    epoch, batch = args.epoch, args.batch
    max_cols_plot = args.max_cols_plot

    # Results directory and log file
    out_dir = os.path.join(file_path, f'results_{APP}_cor{corr_th}')
    utils.make_dir(out_dir)

    def out_path(name):
        """Return the path of ``name`` inside the results directory."""
        return os.path.join(out_dir, name)

    logger = set_logger(filename=out_path(f'{APP}_main_logfile.log'))

    # ========== Load data ==========
    print('\n======= Load TC data =======')
    y_enc = pd.read_csv(YENC_PATH, sep='\t')
    data_train = pd.read_csv(DATAPATH_TR, sep='\t')
    data_val = pd.read_csv(DATAPATH_VL, sep='\t')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    if args.bootstrap_cols > -1:
        # Keep the label column plus a random subset of feature columns and
        # restrict the validation table to the very same columns.
        label_col = data_train.iloc[:, 0]
        sampled = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols,
                                                axis=1,
                                                random_state=SEED)
        data_train = pd.concat([label_col, sampled], axis=1)
        data_val = data_val[data_train.columns]
        print(f'\ndata_train.shape {data_train.shape}')
        print(f'data_val.shape {data_val.shape}')

    def split_xy(frame):
        """Return copies of (features, target) taken from ``frame``."""
        return frame.iloc[:, 1:].copy(), frame.iloc[:, 0].copy()

    def log_shapes(x_tr, x_vl, y_tr, y_vl):
        """Log the shapes of the four train/validation arrays."""
        logger.info(f'xtr.shape {x_tr.shape}')
        logger.info(f'xvl.shape {x_vl.shape}')
        logger.info(f'ytr.shape {y_tr.shape}')
        logger.info(f'yvl.shape {y_vl.shape}')

    def log_f1(y_true, y_pred):
        """Log micro- and macro-averaged f1 for the given label vectors."""
        for avg in ('micro', 'macro'):
            logger.info(f'f1_score {avg}: ' + '{:.3f}'.format(
                f1_score(y_true=y_true, y_pred=y_pred, average=avg)))

    def pfi_report(model, x_vl, y_vl, tag, label):
        """Compute PFI for ``model`` and write its plots and dump.

        ``tag`` goes into the file names and ``label`` into the plot titles.
        """
        started = time.time()
        fi_obj = pfi.PFI(model=model,
                         xdata=x_vl,
                         ydata=y_vl,
                         n_shuffles=n_shuffles,
                         y_enc=y_enc,
                         outdir=out_dir)
        fi_obj.gen_col_sets(th=corr_th, toplot=False)
        fi_obj.compute_pfi(ml_type='c', verbose=True)
        print(f'Total PFI time: {(time.time()-started)/60:.3f} mins')

        # Plot and save PFI
        fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot,
                                 title=f'{label} Classifier (PFI var)')
        fig.savefig(out_path(f'{APP}_{tag}_pfi_var.png'), bbox_inches='tight')
        fig = fi_obj.plot_score_fi(
            max_cols_plot=max_cols_plot,
            title=f'{label} Classifier (PFI MDA: f1-score)')
        fig.savefig(out_path(f'{APP}_{tag}_pfi_score.png'),
                    bbox_inches='tight')
        fig = fi_obj.plot_fimap(figsize=(20, 7),
                                n_top_cols=10,
                                title=f'{label} PFI Map',
                                drop_correlated=True)
        fig.savefig(out_path(f'{APP}_{tag}_pfi_map.png'), bbox_inches='tight')

        # Dump results
        fi_obj.dump(path=out_dir, name=f'{APP}_{tag}')

    # ========== RF classifier ==========
    logger.info('------- Data for RF Classifier -------')
    xtr, ytr = split_xy(data_train)
    xvl, yvl = split_xy(data_val)
    ytr, yvl = ytr.values, yvl.values  # plain arrays for the forest
    features = xtr.columns

    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    log_shapes(xtr, xvl, ytr, yvl)

    # ---------- Train RF classifier ----------
    logger.info('------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      min_samples_leaf=5,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    accuracy = rf_model.score(xvl, yvl)
    logger.info(f'Prediction score (mean accuracy): {accuracy:.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10])
    print('pred', yvl_preds[:10])
    log_f1(yvl, yvl_preds)

    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=y_enc['label'].values,
                                title=f'{APP}_confusion_rf',
                                savefig=True,
                                img_name=out_path(f'{APP}_confusion_rf.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')
    _, fig = utils.plot_rf_fi(rf_model,
                              columns=features,
                              max_cols_plot=max_cols_plot,
                              title='RF Classifier (FI using MDI)',
                              errorbars=False,
                              plot_direction='v',
                              color='darkorange')
    fig.savefig(out_path(f'{APP}_rf_fi.png'), bbox_inches='tight')
    utils.get_rf_fi(rf_model, columns=features).to_csv(
        out_path(f'{APP}_rf_fi.csv'), index=False)

    pfi_report(rf_model, xvl, yvl, tag='rf', label='RF')

    # ========== NN classifier ==========
    logger.info(' ')
    logger.info('------- Data for NN Classifier -------')
    xtr, ytr = split_xy(data_train)
    xvl, yvl = split_xy(data_val)
    features = xtr.columns

    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    log_shapes(xtr, xvl, ytr, yvl)

    # One-hot encode the targets; the class count comes from the train labels.
    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('------- Train NN Classifier -------')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)
    lr_schedule = ReduceLROnPlateau(monitor='val_loss',
                                    factor=0.1,
                                    patience=10,
                                    verbose=1,
                                    mode='auto',
                                    min_delta=0.0001,
                                    cooldown=0,
                                    min_lr=0)
    checkpoint = ModelCheckpoint(filepath=out_path(f'{APP}_nn_model'),
                                 monitor='val_loss',
                                 verbose=0,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)
    keras_model.fit(xtr,
                    ytr,
                    validation_data=(xvl, yvl),
                    epochs=epoch,
                    batch_size=batch,
                    verbose=1,
                    callbacks=[lr_schedule, checkpoint])

    # NOTE(review): [-1] takes the last value returned by evaluate(); whether
    # that is the loss depends on how create_nn_classifier compiles the model.
    score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    true_idx = np.argmax(yvl, axis=1)
    pred_idx = np.argmax(yvl_preds, axis=1)
    print('true', true_idx[:10])
    print('pred', pred_idx[:10])
    log_f1(true_idx, pred_idx)

    # The confusion matrix and PFI need class labels rather than one-hot /
    # score matrices, so collapse them when they are genuinely multi-column.
    if yvl_preds.ndim > 1 and yvl_preds.shape[1] > 1:
        yvl_preds = pred_idx
    if yvl.ndim > 1 and yvl.shape[1] > 1:
        yvl = true_idx

    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=y_enc['label'].values,
                                title=f'{APP}_confusion_nn',
                                savefig=True,
                                img_name=out_path(f'{APP}_confusion_nn.png'))

    pfi_report(keras_model, xvl, yvl, tag='nn', label='NN')
def run():
    """Benchmark PFI run time over a grid of sample and column counts.

    Loads the table at ``DATAPATH`` (column 0 is the target, the rest are
    features), makes a stratified 80/20 split and standardizes the features
    with statistics estimated on the training split only. PFI is then timed
    on the first ``s`` validation rows and the first ``c`` columns for a
    4 x 4 grid of ``(s, c)``; only ``compute_pfi`` is inside the timed
    section. Timings are written to ``tt.csv`` in the results directory.

    The forest depends only on the number of columns, so one model is
    trained per column count and reused across all sample counts.
    """
    n_shuffles = 20
    corr_th = 1

    # Create necessary dirs
    OUTDIR = os.path.join(file_path,
                          f'results_aacr_{APP}_cor{corr_th}_runtime')
    utils.make_dir(OUTDIR)

    # ---------- Load data ----------
    print('\nLoad TC data ...')
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()
    features = xdata.columns

    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)

    # Fit the scaler on the training split only, so that no validation
    # statistics leak into the preprocessing. The original row index is kept
    # so the frames stay aligned with ytr / yvl.
    scaler = StandardScaler()
    xtr = pd.DataFrame(scaler.fit_transform(xtr),
                       columns=features,
                       index=xtr.index)
    xvl = pd.DataFrame(scaler.transform(xvl),
                       columns=features,
                       index=xvl.index)
    print('\nxtr.shape', xtr.shape)
    print('xvl.shape', xvl.shape)

    # ---------- Timing grid ----------
    print('\nCompute PFI ...')
    n_samples = np.linspace(start=int(xvl.shape[0] / 4),
                            stop=xvl.shape[0],
                            num=4,
                            dtype=int)
    n_cols = np.linspace(start=int(xvl.shape[1] / 4),
                         stop=xvl.shape[1],
                         num=4,
                         dtype=int)
    print(n_samples)
    print(n_cols)

    time_cols = ['n_samples', 'n_cols', 'time (sec)', 'time (min)']
    tt = pd.DataFrame(index=range(len(n_samples) * len(n_cols)),
                      columns=time_cols)

    t_run = time.time()

    # One forest per column count: training does not depend on the number of
    # validation rows, so refitting inside the sample loop would be wasted.
    models = {
        int(c): RandomForestClassifier(n_estimators=150,
                                       max_features='sqrt',
                                       random_state=SEED).fit(
                                           xtr.iloc[:, :c], ytr)
        for c in n_cols
    }

    cnt = 0
    for s in n_samples:
        for c in n_cols:
            print(f'(n_samples, n_cols): ({s}, {c})')
            xvl_ = xvl.iloc[:s, :c]
            yvl_ = yvl.iloc[:s]  # explicit positional slice

            fi_obj = pfi.PFI(model=models[int(c)],
                             xdata=xvl_,
                             ydata=yvl_,
                             n_shuffles=n_shuffles,
                             outdir=OUTDIR)
            fi_obj.gen_col_sets(th=corr_th, toplot=False, verbose=False)

            # Only the PFI computation itself is timed.
            t0 = time.time()
            fi_obj.compute_pfi(ml_type='c', verbose=False)
            t = time.time() - t0

            tt.loc[cnt, time_cols] = np.array([s, c, t, t / 60])
            cnt += 1

    tt.to_csv(os.path.join(OUTDIR, 'tt.csv'), index=False)
    print(f'\nTotal run time: {(time.time()-t_run)/60} mins')
def run(args):
    """Train an RF classifier on the NT data and export its importances.

    ``args`` supplies ``n_shuffles``, ``corr_th``, ``max_cols`` and
    ``bootstrap_cols`` (a value > -1 keeps only that many randomly sampled
    feature columns). Column 0 of the table at ``DATAPATH`` is the target and
    the remaining columns are the features. The data is split 80/20
    (stratified) and standardized using statistics from the training split
    only, so no validation information leaks into preprocessing.

    The forest's built-in importance plot and the PFI plots and dump are
    written to ``results_aacr_{APP}_cor{corr_th}`` below ``file_path``.
    """
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)

    # ---------- Load data ----------
    print('\nLoad NT data ...')
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()

    if args.bootstrap_cols > -1:
        # Take a random subset of the feature columns
        xdata = xdata.sample(n=args.bootstrap_cols,
                             axis=1,
                             random_state=SEED)
    features = xdata.columns

    print('data.shape', data.shape)
    print(data.iloc[:3, :4])
    print('\nxdata.shape', xdata.shape)
    print('np.unique(ydata)', np.unique(ydata))

    # Split first, then scale: the scaler must only see the training rows.
    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)
    scaler = StandardScaler()
    # Keep the original row index so the frames stay aligned with ytr / yvl.
    xtr = pd.DataFrame(scaler.fit_transform(xtr),
                       columns=features,
                       index=xtr.index)
    xvl = pd.DataFrame(scaler.transform(xvl),
                       columns=features,
                       index=xvl.index)

    # ---------- Train classifier ----------
    print('\nTrain RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    print('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: plot a confusion matrix once a label encoding is available here.

    # ---------- Feature importance from RF and PFI ----------
    # Plot the forest's built-in feature importance
    _, fig = utils.plot_rf_fi(rf_model,
                              columns=features,
                              max_cols=max_cols,
                              title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI on the validation split
    print('\nCompute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c')
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}')
def run(args):
    """Run the RF classification and RF regression importance pipelines.

    ``args`` supplies ``n_shuffles`` and ``corr_th`` (``epoch``, ``batch`` and
    ``max_cols`` are read as well but are not used by the active code). For
    each task the train/validation tables are loaded (column 0 is the target,
    the rest are features), a feature-correlation heatmap is saved, a random
    forest is trained and scored, and the forest's built-in importance plot
    plus the PFI plots and dump are written into ``OUTDIR``.
    """
    print(args)
    n_shuffles, corr_th = args.n_shuffles, args.corr_th
    # Read but currently unused by the active code below.
    epoch, batch, max_cols = args.epoch, args.batch, args.max_cols

    # Create necessary dirs
    utils.make_dir(OUTDIR)

    def out_path(name):
        """Return the path of ``name`` inside OUTDIR."""
        return os.path.join(OUTDIR, name)

    def load_tables(train_path, val_path):
        """Read the train/validation tables and print a short preview."""
        train_df = pd.read_csv(train_path, sep='\t')
        val_df = pd.read_csv(val_path, sep='\t')
        print('data_train.shape', train_df.shape)
        print('data_val.shape ', val_df.shape)
        print(f'\ndata_train:\n{train_df.iloc[:3, :4]}')
        print(f'\ndata_val:\n{val_df.iloc[:3, :4]}')
        return train_df, val_df

    def split_xy(frame):
        """Return copies of (features, target) taken from ``frame``."""
        return frame.iloc[:, 1:].copy(), frame.iloc[:, 0].copy()

    def save_corr_heatmap(x_val, file_name):
        """Plot the feature correlation matrix of ``x_val`` and save it."""
        cor = utils.compute_cor_mat(x_val, zero_diag=True, decimals=5)
        utils.plot_cor_heatmap(cor).savefig(out_path(file_name),
                                            bbox_inches='tight')

    def compute_pfi(model, x_val, y_val, ml_type):
        """Build the PFI object, run it and report the elapsed time."""
        started = time.time()
        fi = pfi.PFI(model=model,
                     xdata=x_val,
                     ydata=y_val,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
        fi.gen_col_sets(th=corr_th, toplot=False)
        fi.compute_pfi(ml_type=ml_type, verbose=False)
        print(f'Total PFI time: {(time.time()-started)/60:.3f} mins')
        return fi

    # ========== Load classification data ==========
    print('\n======== Load classification data ========')
    data_train, data_val = load_tables(DATAPATH_CLASSIFICATION_TRAIN,
                                       DATAPATH_CLASSIFICATION_VAL)

    # ========== RF classifier ==========
    xtr, ytr = split_xy(data_train)
    xvl, yvl = split_xy(data_val)
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    save_corr_heatmap(xvl, 'feature_corr_classification.png')

    # ---------- Train classifier ----------
    print('\n------- Train RF Classifier -------')
    classifier = RandomForestClassifier(n_estimators=200,
                                        max_features='sqrt',
                                        random_state=SEED)
    classifier.fit(xtr, ytr)
    accuracy = classifier.score(xvl, yvl)
    print(f'Prediction score (mean accuracy): {accuracy:.4f}')

    yvl_preds = classifier.predict(xvl)
    print('true', yvl[:5].values)
    print('pred', yvl_preds[:5])
    for avg in ('micro', 'macro'):
        print(f'f1_score {avg}: ' + '{:.3f}'.format(
            f1_score(y_true=yvl, y_pred=yvl_preds, average=avg)))

    yvl_preds_p = classifier.predict_proba(xvl)
    print(f'yvl_preds_p:\n{yvl_preds_p[:5]}')

    # NOTE(review): yvl.unique() lists labels in order of first appearance;
    # confirm that plot_confusion_matrix does not expect them sorted.
    utils.plot_confusion_matrix(
        y_true=yvl,
        y_pred=yvl_preds,
        labels=yvl.unique(),
        title='RF Classifier (Confusion)',
        savefig=True,
        img_name=out_path('rf_classifier_confusion.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')
    _, fig = utils.plot_rf_fi(classifier,
                              columns=features,
                              title='RF Classifier (FI using MDI)')
    fig.savefig(out_path('rf_classifier_fi.png'), bbox_inches='tight')

    fi_obj = compute_pfi(classifier, xvl, yvl, ml_type='c')

    # Plot and save PFI
    fi_obj.plot_var_fi(title='RF Classifier (PFI var)').savefig(
        out_path('rf_classifier_pfi_var.png'), bbox_inches='tight')
    fi_obj.plot_score_fi(title='RF Classifier (PFI MDA: f1-score)').savefig(
        out_path('rf_classifier_pfi_score.png'), bbox_inches='tight')
    fi_obj.plot_score_fi_p(title='RF Classifier (PFI MDA: p-score)').savefig(
        out_path('rf_classifier_pfi_score_p.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_classifier')

    # ========== Load regression data ==========
    print('\n======== Load regression data ========')
    data_train, data_val = load_tables(DATAPATH_REGRESSION_TRAIN,
                                       DATAPATH_REGRESSION_VAL)

    # ========== RF regressor ==========
    xtr, ytr = split_xy(data_train)
    xvl, yvl = split_xy(data_val)
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    save_corr_heatmap(xvl, 'feature_corr_regression.png')

    # ---------- Train regressor ----------
    print('\n------- Train RF Regressor -------')
    regressor = RandomForestRegressor(n_estimators=150,
                                      min_samples_leaf=5,
                                      max_features='sqrt',
                                      random_state=SEED)
    regressor.fit(xtr, ytr)
    score = regressor.score(xvl, yvl)
    print(f'Prediction score (r_square): {score:.4f}')

    # ---------- Feature importance from RF and PFI ----------
    print('\n------- MDI and PFI from RF regressor -------')
    _, fig = utils.plot_rf_fi(regressor,
                              columns=features,
                              title='RF Regressor (FI using MDI)')
    fig.savefig(out_path('rf_regressor_fi.png'), bbox_inches='tight')

    fi_obj = compute_pfi(regressor, xvl, yvl, ml_type='r')

    # Plot and save PFI
    fi_obj.plot_var_fi(title='RF Regressor (PFI var)').savefig(
        out_path('rf_regressor_pfi_var.png'), bbox_inches='tight')
    # NOTE(review): the title says "f1-score" although this is a regressor;
    # confirm which score PFI reports for ml_type='r' before renaming.
    fi_obj.plot_score_fi(title='RF Regressor (PFI MDA: f1-score)').savefig(
        out_path('rf_regressor_pfi_score.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_regressor')