def run():
    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # Load data
    print('Loading NT data ...')
    nt = pd.read_csv(os.path.join(DATAPATH, FILENAME), sep=',')
    mm = pd.read_csv(os.path.join(DATAPATH, GENEMAPFILE), sep='\t')

    # Extract lincs cols from the whole dataset
    nt = nt[['Sample'] + mm['gdc'].tolist()]

    # Map lincs gene names and sort genes alphabetically
    col_mapping = {mm.loc[g, 'gdc']: mm.loc[g, 'symbol'] for g in range(mm.shape[0])}
    nt = nt.rename(columns=col_mapping)
    nt = nt[['Sample'] + sorted(nt.columns[1:].tolist())]

    # Shuffle and extract the target label
    nt = nt.sample(n=nt.shape[0], axis=0, replace=False, random_state=SEED).reset_index(drop=True)
    nt['Sample'] = nt['Sample'].map(lambda s: s.split('-')[-1]).values
    nt.rename(columns={'Sample': 'y'}, inplace=True)
    print(nt['y'].value_counts())

    nt.to_csv(os.path.join(OUTDIR, f'{APP}_data'), sep='\t', index=False)
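# ----------------------------------------------------------------------------
# Worked example of the label extraction above: the class is taken to be the
# suffix after the last hyphen in the sample ID. For a hypothetical ID:
#   'TCGA-02-0001-01'.split('-')[-1]  ->  '01'
# ----------------------------------------------------------------------------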
def run(args):
    print(args)
    datapath = args.datapath
    modeldir = args.modeldir
    tempdir = args.tempdir
    n_shuffles = args.n_shuffles
    # col_set = args.col_set
    col_set = ['C', 'F']

    # Create necessary dirs
    utils.make_dir(tempdir)  # os.makedirs(tempdir, exist_ok=True)

    # Load data (sep=None lets pandas sniff the delimiter; requires the python engine)
    data = pd.read_csv(datapath, sep=None, engine='python')
    # ydata = data.iloc[:, 0].values
    xdata = data.iloc[:, 1:]  # keep as a DataFrame so column names are preserved
    features = xdata.columns

    # # Scale data
    # scaler = StandardScaler()
    # xdata = scaler.fit_transform(xdata)
    # xdata = pd.DataFrame(xdata, columns=features)

    # Load trained keras model
    # (no need to compile the model for inference)
    model_name = 'keras_model.json'
    weights_name = 'keras_weights.h5'
    model_path = os.path.join(modeldir, model_name)
    weights_path = os.path.join(modeldir, weights_name)

    print('\nLoading model from ... {}'.format(model_path))
    with open(model_path, 'r') as json_file:
        model = json_file.read()
    keras_model = model_from_json(model)
    keras_model.name = 'trained_model'

    # Load weights into new model
    print('Loading weights from ... {}\n'.format(weights_path))
    keras_model.load_weights(weights_path)

    # Shuffle and predict
    # pred_df = infer_with_col_shuffle_multi(model=keras_model, xdata=xdata, col=col, n_shuffles=n_shuffles)
    pred_df = pfi_utils.shuf_and_pred_multi(model=keras_model, xdata=xdata,
                                            col_set=col_set, n_shuffles=n_shuffles)

    # ## ----------------------------------------------------------------------
    # pred_df = pd.DataFrame(index=range(len(xdata)), columns=range(n_shuffles))
    # for s in range(n_shuffles):
    #     # Execute infer code
    #     preds = infer_with_col_shuffle(model, xdata, col)
    #     pred_df.iloc[:, s] = preds
    # ## ----------------------------------------------------------------------

    # Write out results
    # pred_df.to_csv(os.path.join(tempdir, 'col.' + str(col) + '.csv'), sep='\t', index=False)
    pred_df.to_csv(os.path.join(tempdir, 'col.' + '-'.join(col_set) + '.csv'),
                   sep='\t', index=False)
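# ----------------------------------------------------------------------------
# pfi_utils.shuf_and_pred_multi is not shown in this excerpt. Below is a
# minimal, illustrative sketch of what such a helper could look like, assuming
# (a) the model exposes predict(), and (b) the columns in col_set are permuted
# jointly with a single row permutation per shuffle. This is NOT the actual
# pfi_utils implementation.
import numpy as np
import pandas as pd

def _shuf_and_pred_multi_sketch(model, xdata, col_set, n_shuffles, seed=0):
    rng = np.random.RandomState(seed)
    pred_df = pd.DataFrame(index=range(len(xdata)), columns=range(n_shuffles))
    for s in range(n_shuffles):
        x = xdata.copy()
        perm = rng.permutation(len(x))                # one permutation per shuffle
        x.loc[:, col_set] = x[col_set].values[perm]   # shuffle the col set jointly
        preds = model.predict(x.values)
        if preds.ndim > 1 and preds.shape[1] > 1:     # classifier: take class label
            preds = np.argmax(preds, axis=1)
        pred_df.iloc[:, s] = np.squeeze(preds)
    return pred_df
# ----------------------------------------------------------------------------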
def run(args):
    # TODO: log out the args
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_ff_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ========== Load data ==========
    print('\nLoad TC data ...')
    # y_enc = pd.read_csv(YENC_PATH, sep='\t')
    ## data = pd.read_csv(DATAPATH, sep='\t')
    ## xdata = data.iloc[:, 1:].copy()
    ## ydata = data.iloc[:, 0].copy()
    data_train = pd.read_csv(DATAPATH_TR, sep=',')
    data_val = pd.read_csv(DATAPATH_VL, sep=',')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    # Extract lincs from the whole dataset
    mm = pd.read_csv('/vol/ml/apartin/Benchmarks/Data/Pilot1/lincs1000.tsv', sep='\t')
    data_train = data_train[['case_id', 'cancer_type'] + mm['gdc'].tolist()]
    data_val = data_val[['case_id', 'cancer_type'] + mm['gdc'].tolist()]
    print(data_train.shape)
    print(data_val.shape)

    if args.bootstrap_cols > -1:
        ## xdata = xdata.sample(n=args.bootstrap_cols, axis=1, random_state=SEED)  # Take a subset of cols
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols, axis=1, random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    ## features = xdata.columns
    ## print('data.shape', data.shape)
    ## print(data.iloc[:3, :4])
    ## print('\nxdata.shape', xdata.shape)
    ## print('np.unique(ydata)', np.unique(ydata))
    ## scaler = StandardScaler()
    ## xdata = scaler.fit_transform(xdata)
    ## xdata = pd.DataFrame(xdata, columns=features)
    ## xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)

    # ========== RF classifier ==========
    logger.info('RF classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info('Train RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ...
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # ---------- Feature importance ----------
    # Plot RF FI (MDI)
    indices, fig = utils.plot_rf_fi(rf_model, columns=features, max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols, title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols, title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ========== NN classifier ==========
    logger.info(' ')
    logger.info('NN classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('Training NN Classifier ...')
    keras_model = create_nn_classifier(n_features=xtr.shape[1], n_classes=n_classes)
    history = keras_model.fit(xtr, ytr, epochs=epoch, batch_size=batch, verbose=0)
    # utils.plot_keras_learning(history, figsize=(10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))

    score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='macro')))

    # ---------- Feature importance ----------
    # PFI
    # NOTE: yvl is still one-hot here; the later version of this script
    # converts back to class labels with np.argmax before calling PFI.
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols, title='NN Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols, title='NN Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
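# ----------------------------------------------------------------------------
# The PFI computed above is permutation feature importance in the MDA ("mean
# decrease in accuracy") sense: permute a feature, re-score, record the score
# drop. A self-contained sketch of the idea for an sklearn-style model
# (illustrative only; NOT the pfi package's implementation, which also groups
# correlated columns via gen_col_sets):
import numpy as np
from sklearn.metrics import f1_score

def _pfi_mda_sketch(model, xvl, yvl, n_shuffles=20, seed=0):
    rng = np.random.RandomState(seed)
    base = f1_score(yvl, model.predict(xvl), average='micro')  # baseline score
    imp = {}
    for col in xvl.columns:
        drops = []
        for _ in range(n_shuffles):
            x = xvl.copy()
            x[col] = rng.permutation(x[col].values)  # break feature-target link
            drops.append(base - f1_score(yvl, model.predict(x), average='micro'))
        imp[col] = float(np.mean(drops))             # mean decrease in score
    return imp
# ----------------------------------------------------------------------------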
def run(args):
    print(args)
    datapath = args.datapath
    modeldir = args.modeldir
    tempdir = args.tempdir
    n_shuffles = args.n_shuffles
    corr_th = CORR_THRES

    # Create necessary dirs
    utils.make_dir(TEMPDIR)  # os.makedirs(TEMPDIR, exist_ok=True)  # python 3
    utils.make_dir(OUTDIR)   # os.makedirs(OUTDIR, exist_ok=True)   # python 3

    # ======= Load dataset =======
    data = pd.read_csv(datapath, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()
    features = xdata.columns
    print('data.shape', data.shape)
    print(data.iloc[:3, :4])
    print('np.unique(ydata)', np.unique(ydata))

    # Scale data
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    # Split data (stratify only for classification)
    if 'classification' in datapath.split(os.sep)[-1]:
        print('classification')
        xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                              random_state=SEED, stratify=ydata)
    elif 'regression' in datapath.split(os.sep)[-1]:
        print('regression')
        xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                              random_state=SEED)

    # ======= Load trained keras model =======
    # (no need to compile the model for inference)
    model_name = 'keras_model.json'
    weights_name = 'keras_weights.h5'
    model_path = os.path.join(modeldir, model_name)
    weights_path = os.path.join(modeldir, weights_name)

    print(f'\nLoading model from ... {model_path}')
    with open(model_path, 'r') as json_file:
        model = json_file.read()
    keras_model = model_from_json(model)
    keras_model.name = 'trained_model'

    # Load weights into new model
    print(f'\nLoading weights from ... {weights_path}')
    keras_model.load_weights(weights_path)

    # ======= Feature importance =======
    # Compute correlated features subgroups
    col_sets = pfi_utils.get_fea_groups(xvl, th=corr_th, toplot=False)
    if len(col_sets) == 0:
        # No correlated groups -> every column is its own set
        col_sets = [[c] for c in xdata.columns.tolist()]
    else:
        cols_unq_req = set()  # set of unique cols that appear in col_sets
        for col_set in col_sets:
            for col in col_set:
                cols_unq_req.add(col)
        cols_unq = set(xdata.columns.tolist())
        cols_other = cols_unq.difference(cols_unq_req)
        col_sets = sorted(col_sets, key=len, reverse=True)  # sort by set size
        col_sets.extend([[c] for c in cols_other])          # append remaining singletons

    # Create dfs that store feature importance
    fi_var = pd.DataFrame(index=range(len(col_sets)), columns=['cols', 'imp'])
    fi_score = pd.DataFrame(index=range(len(col_sets)), columns=['cols', 'imp', 'std'])  # (not filled below)

    # Iter over col sets (col set per node)
    print('Iterate over col sets to compute importance ...')
    for i, col_set in enumerate(col_sets):
        # pred_df = infer_with_col_shuffle_multi(model=keras_model, xdata=xdata, col=col, n_shuffles=n_shuffles)
        pred_df = pfi_utils.shuf_and_pred_multi(model=keras_model, xdata=xdata,
                                                col_set=col_set, n_shuffles=n_shuffles)

        # ## ----------------------------------------------------------------------
        # pred_df = pd.DataFrame(index=range(xdata.shape[0]), columns=range(n_shuffles))
        # for s in range(n_shuffles):
        #     # Execute infer code
        #     # TODO: still need to decide regarding the output ...
        #     preds = shuf_and_pred(model=keras_model, xdata=xdata, col=col)
        #     pred_df.iloc[:, s] = preds
        # ## ----------------------------------------------------------------------

        pred_df.to_csv(os.path.join(tempdir, 'col.' + '-'.join(col_set) + '.csv'),
                       sep='\t', index=False)
        fi_var.loc[i, 'cols'] = ','.join(col_set)
        fi_var.loc[i, 'imp'] = pred_df.var(axis=1).mean()

    # Normalize, sort, and save the variance-based importance
    fi_var['imp'] = fi_var['imp'] / fi_var['imp'].sum()
    fi_var = fi_var.sort_values('imp', ascending=False).reset_index(drop=True)
    fi_var.to_csv(os.path.join(OUTDIR, 'fi_var.csv'), index=False)
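# ----------------------------------------------------------------------------
# pfi_utils.get_fea_groups(xvl, th=corr_th) returns sets of columns whose
# pairwise correlation exceeds the threshold, so correlated features are
# shuffled together. A rough sketch of such a grouping (illustrative only;
# the real helper may use a different clustering rule):
import numpy as np

def _get_fea_groups_sketch(xdf, th=0.9):
    cor = xdf.corr().abs().values
    np.fill_diagonal(cor, 0.0)               # ignore self-correlation
    cols, seen, groups = list(xdf.columns), set(), []
    for i, c in enumerate(cols):
        if c in seen:
            continue
        partners = [cols[j] for j in np.where(cor[i] >= th)[0]
                    if cols[j] not in seen and cols[j] != c]
        if partners:
            grp = [c] + partners
            seen.update(grp)
            groups.append(grp)
    return groups  # only multi-feature groups; singletons handled by the caller
# ----------------------------------------------------------------------------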
def run(args):
    print(args)
    n_samples = args.n_samples
    n_classes = args.n_classes
    n_features = args.n_features
    n_informative = args.n_informative

    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # Build classification dataset
    print('\n======= Generate classification data =======')
    xdata, ydata = make_classification(
        n_samples=n_samples, n_classes=n_classes, n_features=n_features,
        n_informative=n_informative,
        n_redundant=N_REDUNDANT,  # features generated as random linear combinations of the informative features
        n_repeated=N_REPEATED,    # duplicated features, drawn randomly from the informative and the redundant features
        shift=None, scale=None, random_state=SEED, shuffle=False)

    ydata = pd.DataFrame(ydata).rename(columns={0: 'y'})
    # xdata = pd.DataFrame(xdata)
    # Single-letter feature names; assumes n_features <= 26
    xdata = pd.DataFrame(xdata, columns=[c for c in string.ascii_uppercase[:xdata.shape[1]]])

    data = pd.concat([ydata, xdata], axis=1)
    data = data.sample(data.shape[0], axis=0, replace=False, random_state=SEED)

    features = xdata.columns
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=TEST_SIZE,
                                          random_state=SEED, shuffle=True, stratify=ydata)
    data_train = pd.concat([ytr, xtr], axis=1)
    data_val = pd.concat([yvl, xvl], axis=1)

    # Sort val data by class label
    data_val = data_val.sort_values('y', ascending=True).reset_index(drop=True)

    print('data.shape      ', data.shape)
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(data.iloc[:3, :4])

    if (N_REDUNDANT == 0) and (N_REPEATED == 0):
        # data.to_csv(os.path.join(OUTDIR, 'data_classification'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_classification_train'),
                          sep='\t', float_format=np.float16, index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_classification_val'),
                        sep='\t', float_format=np.float16, index=False)
    else:
        # data.to_csv(os.path.join(OUTDIR, 'data_classification_corr'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_classification_corr_train'),
                          sep='\t', float_format=np.float16, index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_classification_corr_val'),
                        sep='\t', float_format=np.float16, index=False)

    # Build regression dataset
    print('\n======= Generate regression data =======')
    xdata, ydata = make_regression(n_samples=n_samples, n_targets=1,
                                   n_features=n_features, n_informative=n_informative,
                                   bias=0.0, effective_rank=None, tail_strength=0.5,
                                   noise=0.0, coef=False, random_state=SEED, shuffle=True)

    ydata = pd.DataFrame(ydata).rename(columns={0: 'y'})
    # xdata = pd.DataFrame(xdata)
    xdata = pd.DataFrame(xdata, columns=[c for c in string.ascii_uppercase[:xdata.shape[1]]])

    data = pd.concat([ydata, xdata], axis=1)
    data = data.sample(data.shape[0], replace=False, random_state=SEED)

    features = xdata.columns
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                          random_state=SEED, shuffle=True)
    data_train = pd.concat([ytr, xtr], axis=1)
    data_val = pd.concat([yvl, xvl], axis=1)

    print('data.shape      ', data.shape)
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(data.iloc[:3, :4])

    if (N_REDUNDANT == 0) and (N_REPEATED == 0):
        # data.to_csv(os.path.join(OUTDIR, 'data_regression'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_regression_train'),
                          sep='\t', float_format=np.float16, index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_regression_val'),
                        sep='\t', float_format=np.float16, index=False)
    else:
        # data.to_csv(os.path.join(OUTDIR, 'data_regression_corr'), sep='\t', float_format=np.float16, index=False)
        data_train.to_csv(os.path.join(OUTDIR, 'data_regression_corr_train'),
                          sep='\t', float_format=np.float16, index=False)
        data_val.to_csv(os.path.join(OUTDIR, 'data_regression_corr_val'),
                        sep='\t', float_format=np.float16, index=False)
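# ----------------------------------------------------------------------------
# A minimal sketch of the argparse entry point this run(args) implies. The
# flag names below are assumptions (the actual parser is not shown in this
# excerpt); defaults are illustrative.
if __name__ == '__main__':
    import argparse
    psr = argparse.ArgumentParser(description='Generate synthetic train/val datasets.')
    psr.add_argument('--n_samples', type=int, default=1000)
    psr.add_argument('--n_classes', type=int, default=2)
    psr.add_argument('--n_features', type=int, default=10)
    psr.add_argument('--n_informative', type=int, default=5)
    run(psr.parse_args())
# ----------------------------------------------------------------------------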
def run(args):
    # TODO: log out the args
    print(f'\n{args}')
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols_plot = args.max_cols_plot

    # Create necessary dirs
    # dataset = DATAPATH_TR.split('_')[-1]  # TODO: clean/fix
    OUTDIR = os.path.join(file_path, f'results_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ========== Load data ==========
    print('\n======= Load TC data =======')
    y_enc = pd.read_csv(YENC_PATH, sep='\t')
    data_train = pd.read_csv(DATAPATH_TR, sep='\t')
    data_val = pd.read_csv(DATAPATH_VL, sep='\t')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    if args.bootstrap_cols > -1:
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols, axis=1, random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    # Compute corr matrix
    # cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    # fig = utils.plot_cor_heatmap(cor)
    # fig.savefig(os.path.join(OUTDIR, f'{APP}_feature_corr.png'), bbox_inches='tight')

    # # k-fold scheme
    # kfolds = 5
    # if kfolds == 1:
    #     skf = StratifiedShuffleSplit(n_splits=kfolds, test_size=0.2, random_state=SEED)
    # else:
    #     skf = StratifiedKFold(n_splits=kfolds, shuffle=False, random_state=SEED)

    # # Run k-fold CV
    # best_model = None
    # best_model_id = 0
    # best_score = 0
    # df_scores = pd.DataFrame(index=range(kfolds), columns=['kfold', 'f1_micro', 'f1_macro'])
    # for f, (train_idx, val_idx) in enumerate(skf.split(xdata, ydata)):
    #     print(f'\nFold {f + 1}/{kfolds} ...\n')
    #     print('train_idx', train_idx)
    #     print('val_idx', val_idx)

    #     # Split data
    #     xtr, xvl = xdata[train_idx], xdata[val_idx]
    #     ytr, yvl = ydata[train_idx], ydata[val_idx]

    #     rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)  # min_samples_split=3,
    #     rf_model.fit(xtr, ytr)
    #     score = rf_model.score(xvl, yvl)
    #     print(f'Prediction score (mean accuracy): {score:.4f}')

    #     yvl_preds = rf_model.predict(xvl)
    #     print('true', yvl[:7])
    #     print('pred', yvl_preds[:7])
    #     print(f'f1_score micro: {f1_score(y_true=yvl, y_pred=yvl_preds, average="micro"):.3f}')
    #     print(f'f1_score macro: {f1_score(y_true=yvl, y_pred=yvl_preds, average="macro"):.3f}')

    #     tmp_df = pd.DataFrame({'yvl': yvl, 'yvl_preds': yvl_preds})
    #     tmp_df.to_csv(os.path.join(OUTDIR, f'preds_cv_{f}.csv'), index=False)

    #     # Plot feature importance
    #     indices, fig = utils.plot_rf_fi(rf_model, n_features_toplot=15, title='FI RF Classifier')
    #     fi = utils.get_rf_fi(rf_model)
    #     fi.to_csv(os.path.join(OUTDIR, 'rf_classifier_fi.csv'), index=False)
    #     fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'), bbox_inches='tight')

    #     # Compute scores
    #     df_scores.loc[f, 'kfold'] = f + 1
    #     df_scores.loc[f, 'f1_micro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')
    #     df_scores.loc[f, 'f1_macro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')

    #     # Save best model
    #     ## if val_scores.iloc[f, 0] < best_score:
    #     if best_score < df_scores.loc[f, 'f1_micro']:
    #         best_score = df_scores.loc[f, 'f1_micro']
    #         best_model = rf_model
    #         best_model_id = f

    # print(df_scores)
    # model = best_model

    # ========== RF classifier ==========
    logger.info('------- Data for RF Classifier -------')
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy().values
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy().values
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info('------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200, min_samples_leaf=5,
                                      max_features='sqrt', random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    # print('true', yvl[:10].values)
    print('true', yvl[:10])
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['label'].values,
                                title=f'{APP}_confusion_rf', savefig=True,
                                img_name=os.path.join(OUTDIR, f'{APP}_confusion_rf.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')

    # Plot RF FI (MDI)
    indices, fig = utils.plot_rf_fi(rf_model, columns=features, max_cols_plot=max_cols_plot,
                                    title='RF Classifier (FI using MDI)', errorbars=False,
                                    plot_direction='v', color='darkorange')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles,
                     y_enc=y_enc, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot, title='RF Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols_plot=max_cols_plot, title='RF Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'), bbox_inches='tight')
    fig = fi_obj.plot_fimap(figsize=(20, 7), n_top_cols=10, title='RF PFI Map', drop_correlated=True)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_map.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ========== NN classifier ==========
    logger.info(' ')
    logger.info('------- Data for NN Classifier -------')
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('------- Train NN Classifier -------')
    keras_model = create_nn_classifier(n_features=xtr.shape[1], n_classes=n_classes)

    # callback_list = [EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0,
    #                                mode='auto', baseline=None, restore_best_weights=True)]
    callback_list = [
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1,
                          mode='auto', min_delta=0.0001, cooldown=0, min_lr=0),
        ModelCheckpoint(filepath=os.path.join(OUTDIR, f'{APP}_nn_model'),
                        monitor='val_loss', verbose=0, save_best_only=True,
                        save_weights_only=False, mode='auto', period=1)
    ]

    history = keras_model.fit(xtr, ytr, validation_data=(xvl, yvl), epochs=epoch,
                              batch_size=batch, verbose=1, callbacks=callback_list)
    # utils.plot_keras_learning(history, figsize=(10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))

    score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='macro')))

    # Reshape target (required for confusion matrix and PFI)
    if yvl_preds.ndim > 1 and yvl_preds.shape[1] > 1:  # if classification, get the class label
        yvl_preds = np.argmax(yvl_preds, axis=1)
    if yvl.ndim > 1 and yvl.shape[1] > 1:              # if classification, get the class label
        yvl = np.argmax(yvl, axis=1)

    utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['label'].values,
                                title=f'{APP}_confusion_nn', savefig=True,
                                img_name=os.path.join(OUTDIR, f'{APP}_confusion_nn.png'))

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles,
                     y_enc=y_enc, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot, title='NN Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols_plot=max_cols_plot, title='NN Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'), bbox_inches='tight')
    fig = fi_obj.plot_fimap(figsize=(20, 7), n_top_cols=10, title='NN PFI Map', drop_correlated=True)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_map.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
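# ----------------------------------------------------------------------------
# Tiny worked example of the argmax reshaping above: a one-hot (n, n_classes)
# target collapses back to integer class labels before the confusion matrix
# and PFI are computed.
#   yvl = [[0, 0, 1],
#          [1, 0, 0],
#          [0, 1, 0]]   ->   np.argmax(yvl, axis=1) == [2, 0, 1]
# ----------------------------------------------------------------------------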
def run():
    # print(args)
    n_shuffles = 20
    corr_th = 1

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_cor{corr_th}_runtime')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ========== RF classifier ==========
    print('\nLoad TC data ...')

    # ---------- Load data ----------
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()
    features = xdata.columns

    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                          random_state=SEED, shuffle=True, stratify=ydata)
    print('\nxtr.shape', xtr.shape)
    print('xvl.shape', xvl.shape)

    # ---------- Feature importance from RF and PFI ----------
    print('\nCompute PFI ...')
    # Grid of dataset sizes: 4 sample counts x 4 column counts
    n_samples = np.linspace(start=int(xvl.shape[0]/4), stop=xvl.shape[0], num=4, dtype=int)
    n_cols = np.linspace(start=int(xvl.shape[1]/4), stop=xvl.shape[1], num=4, dtype=int)
    print(n_samples)
    print(n_cols)

    tt = pd.DataFrame(index=range(len(n_samples) * len(n_cols)),
                      columns=['n_samples', 'n_cols', 'time (sec)', 'time (min)'])

    t_run = time.time()
    cnt = 0
    for i, s in enumerate(n_samples):
        for j, c in enumerate(n_cols):
            print(f'(n_samples, n_cols): ({s}, {c})')
            xtr_ = xtr.iloc[:, :c]
            xvl_ = xvl.iloc[:s, :c]
            yvl_ = yvl[:s]
            # print('xtr_.shape', xtr_.shape)
            # print('xvl_.shape', xvl_.shape)
            # print('yvl_.shape', yvl_.shape)

            rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)
            rf_model.fit(xtr_, ytr)

            fi_obj = pfi.PFI(model=rf_model, xdata=xvl_, ydata=yvl_,
                             n_shuffles=n_shuffles, outdir=OUTDIR)
            fi_obj.gen_col_sets(th=corr_th, toplot=False, verbose=False)

            t0 = time.time()
            fi_obj.compute_pfi(ml_type='c', verbose=False)
            t = time.time() - t0
            tt.loc[cnt, ['n_samples', 'n_cols', 'time (sec)', 'time (min)']] = np.array([s, c, t, t/60])
            cnt += 1

    tt.to_csv(os.path.join(OUTDIR, 'tt.csv'), index=False)
    print(f'\nTotal run time: {(time.time()-t_run)/60:.3f} mins')
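# ----------------------------------------------------------------------------
# A possible follow-up for the timing grid saved to tt.csv (illustrative; not
# part of the run above): pivot the long table into an n_samples x n_cols grid
# of minutes and render it as a heatmap.
import pandas as pd
import matplotlib.pyplot as plt

def _plot_runtime_grid(tt_path):
    tt = pd.read_csv(tt_path)
    grid = tt.pivot(index='n_samples', columns='n_cols', values='time (min)')
    fig, ax = plt.subplots()
    im = ax.imshow(grid.values.astype(float), aspect='auto')
    ax.set_xticks(range(len(grid.columns)))
    ax.set_xticklabels(grid.columns)
    ax.set_yticks(range(len(grid.index)))
    ax.set_yticklabels(grid.index)
    ax.set_xlabel('n_cols')
    ax.set_ylabel('n_samples')
    fig.colorbar(im, ax=ax, label='PFI time (min)')
    return fig
# ----------------------------------------------------------------------------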
def run(args):
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ========== RF classifier ==========
    print('\nLoad NT data ...')

    # ---------- Load data ----------
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()

    if args.bootstrap_cols > -1:
        xdata = xdata.sample(n=args.bootstrap_cols, axis=1, random_state=SEED)  # Take a subset of cols
    features = xdata.columns

    print('data.shape', data.shape)
    print(data.iloc[:3, :4])
    print('\nxdata.shape', xdata.shape)
    print('np.unique(ydata)', np.unique(ydata))

    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                          random_state=SEED, shuffle=True, stratify=ydata)

    # ---------- Train classifier ----------
    print('\nTrain RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    print('f1_score micro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ...
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # Compute corr matrix
    # cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    # fig = utils.plot_cor_heatmap(cor)
    # fig.savefig(os.path.join(OUTDIR, f'{APP}_feature_corr.png'), bbox_inches='tight')

    # ---------- Feature importance from RF and PFI ----------
    # Plot RF FI (MDI)
    indices, fig = utils.plot_rf_fi(rf_model, columns=features, max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI
    print('\nCompute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c')
    # logger.info(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols, title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols, title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=APP)
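# ----------------------------------------------------------------------------
# utils.plot_rf_fi / utils.get_rf_fi are repo helpers not shown here. The MDI
# numbers they report presumably come from sklearn's impurity-based
# feature_importances_; a minimal sketch of that extraction (illustrative):
import pandas as pd

def _get_rf_fi_sketch(rf_model, columns):
    fi = pd.DataFrame({'cols': columns, 'imp': rf_model.feature_importances_})
    return fi.sort_values('imp', ascending=False).reset_index(drop=True)
# ----------------------------------------------------------------------------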
def run(args):
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ========== Load classification data ==========
    print('\n======== Load classification data ========')
    data_train = pd.read_csv(DATAPATH_CLASSIFICATION_TRAIN, sep='\t')
    data_val = pd.read_csv(DATAPATH_CLASSIFICATION_VAL, sep='\t')
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(f'\ndata_train:\n{data_train.iloc[:3, :4]}')
    print(f'\ndata_val:\n{data_val.iloc[:3, :4]}')

    # ========== RF classifier ==========
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    fig = utils.plot_cor_heatmap(cor)
    fig.savefig(os.path.join(OUTDIR, 'feature_corr_classification.png'), bbox_inches='tight')

    # ---------- Train classifier ----------
    print('\n------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:5].values)
    print('pred', yvl_preds[:5])
    print('f1_score micro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))
    yvl_preds_p = rf_model.predict_proba(xvl)
    print(f'yvl_preds_p:\n{yvl_preds_p[:5]}')

    utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=yvl.unique(),
                                title='RF Classifier (Confusion)', savefig=True,
                                img_name=os.path.join(OUTDIR, 'rf_classifier_confusion.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')

    # Plot RF FI (MDI)
    indices, fig = utils.plot_rf_fi(rf_model, columns=features, title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'), bbox_inches='tight')

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=False)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(title='RF Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(title='RF Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_score.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi_p(title='RF Classifier (PFI MDA: p-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_score_p.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_classifier')

    # ========== NN classifier ==========
    # print('\nLoad classification data ...')

    # # ---------- Load data ----------
    # data = pd.read_csv(DATAPATH_CLASSIFICATION, sep='\t')
    # xdata = data.iloc[:, 1:].copy()
    # ydata = data.iloc[:, 0].copy()
    # features = xdata.columns
    # print('data.shape', data.shape)
    # print(data.iloc[:3, :4])
    # print('\nxdata.shape', xdata.shape)
    # print('np.unique(ydata)', np.unique(ydata))

    # n_classes = len(np.unique(ydata))
    # ydata = keras.utils.to_categorical(ydata, num_classes=n_classes)

    # scaler = StandardScaler()
    # xdata = scaler.fit_transform(xdata)
    # xdata = pd.DataFrame(xdata, columns=features)

    # xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)
    # n_classes = len(np.unique(ydata))
    # ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    # yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # print('\nTrain NN Classifier ...')
    # keras_model = create_nn_classifier(n_features=xtr.shape[1], n_classes=n_classes)
    # history = keras_model.fit(xtr, ytr, epochs=epoch, batch_size=batch, verbose=0)
    # score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    # print(f'Prediction score (val loss): {score:.4f}')

    # yvl_preds = keras_model.predict(xvl)
    # print('true', np.argmax(yvl[:10], axis=1))
    # print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    # print('f1_score micro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='micro')))
    # print('f1_score macro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='macro')))

    # # ---------- Feature importance from RF and PFI ----------
    # # PFI
    # print('\nCompute PFI (NN classifier) ...')
    # t0 = time.time()
    # fi_obj = pfi.PFI(model=keras_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles)
    # fi_obj.gen_col_sets(th=corr_th, toplot=False)
    # fi_obj.compute_pfi(ml_type='c', verbose=False)
    # print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # # Plot and save PFI
    # fig = fi_obj.plot_var_fi(title='NN Classifier (PFI var)', ylabel='Importance (relative)')
    # fig.savefig(os.path.join(OUTDIR, 'nn_classifier_pfi_var.png'), bbox_inches='tight')
    # fig = fi_obj.plot_score_fi(title='NN Classifier (PFI MDA: f1-score)', ylabel='Importance (score decrease)')
    # fig.savefig(os.path.join(OUTDIR, 'nn_classifier_pfi_score.png'), bbox_inches='tight')

    # # Dump results
    # fi_obj.dump(path=OUTDIR, name='nn_classifier')

    # ========== Load regression data ==========
    print('\n======== Load regression data ========')
    data_train = pd.read_csv(DATAPATH_REGRESSION_TRAIN, sep='\t')
    data_val = pd.read_csv(DATAPATH_REGRESSION_VAL, sep='\t')
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(f'\ndata_train:\n{data_train.iloc[:3, :4]}')
    print(f'\ndata_val:\n{data_val.iloc[:3, :4]}')

    # ========== RF regressor ==========
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    fig = utils.plot_cor_heatmap(cor)
    fig.savefig(os.path.join(OUTDIR, 'feature_corr_regression.png'), bbox_inches='tight')

    # ---------- Train regressor ----------
    print('\n------- Train RF Regressor -------')
    rf_model = RandomForestRegressor(n_estimators=150, min_samples_leaf=5,
                                     max_features='sqrt', random_state=SEED)
    rf_model.fit(xtr, ytr)
    score = rf_model.score(xvl, yvl)
    print(f'Prediction score (r_square): {score:.4f}')

    # ---------- Feature importance from RF and PFI ----------
    print('\n------- MDI and PFI from RF regressor -------')

    # Plot RF FI (MDI)
    indices, fig = utils.plot_rf_fi(rf_model, columns=features, title='RF Regressor (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_fi.png'), bbox_inches='tight')

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles, outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='r', verbose=False)
    print(f'Total PFI time: {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(title='RF Regressor (PFI var)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_pfi_var.png'), bbox_inches='tight')
    fig = fi_obj.plot_score_fi(title='RF Regressor (PFI MDA)')  # regression uses a regression score, not f1
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_pfi_score.png'), bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_regressor')
def run():
    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # Load data
    dataset = 'raw'
    df_rna, meta = utils.load_lincs1000(dataset=dataset, sources=['gdc'])

    # Specify col name of the target variable (cancer type)
    target_col_name = 'ctype'

    # Drop small classes
    min_class_size = 300
    df_rna, df_rna_small = utils.drop_samples_on_class_count(df=df_rna, y=meta[target_col_name],
                                                             min_class_size=min_class_size)
    df_rna, meta = utils.update_df_and_meta(df_rna, meta, on='Sample')
    print(f'\n{meta[target_col_name].value_counts()}')

    # Balance classes
    class_size = min_class_size
    df_rna, y_out, dropped_classes = utils.balance_df(df=df_rna, y=meta[target_col_name],
                                                      class_size=class_size, seed=SEED)
    df_rna, meta = utils.update_df_and_meta(df_rna, meta, on='Sample')
    print(f'\n{meta[target_col_name].value_counts()}')
    print(f'\ndf_rna.shape {df_rna.shape}')
    print(f'meta.shape {meta.shape}')

    # Create the class `other`
    # df_other = df_rna_small.sample(min_class_size, random_state=SEED)
    # df_rna = pd.concat([df_rna, df_other], axis=0)
    # df_rna, meta = utils.update_df_and_meta(df_rna, meta, on='Sample')
    # print(f'df_rna.shape {df_rna.shape}')
    # print(meta[target_col_name].value_counts())

    # Encode target variable
    ydata = meta['ctype'].values
    y_enc = LabelEncoder()
    ydata = y_enc.fit_transform(ydata)
    y_enc = pd.DataFrame(data={'label': np.arange(0, len(y_enc.classes_)),
                               'type': y_enc.classes_})
    y_enc.to_csv(os.path.join(OUTDIR, f'{APP}_y_enc'), sep='\t', index=False)

    # Permute data
    xdata = df_rna.iloc[:, 1:].copy()
    shuf_idx = np.random.permutation(xdata.shape[0])
    xdata = xdata.iloc[shuf_idx].reset_index(drop=True)
    ydata = pd.Series(ydata[shuf_idx], name='y')
    features = xdata.columns

    # Drop low var cols
    xdata, idx = utils.drop_low_var_cols(xdata, verbose=True)
    features = xdata.columns

    # Split train/val
    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                          random_state=SEED, shuffle=True, stratify=ydata)
    # print(xtr.index[:5])
    # print(ytr.index[:5])
    xtr, ytr = xtr.reset_index(drop=True), ytr.reset_index(drop=True)
    xvl, yvl = xvl.reset_index(drop=True), yvl.reset_index(drop=True)

    # Scale data
    scaler = StandardScaler()
    xtr = scaler.fit_transform(xtr)
    xvl = scaler.transform(xvl)
    xtr = pd.DataFrame(xtr, columns=features)
    xvl = pd.DataFrame(xvl, columns=features)
    # print('xtr.var(axis=0).mean()', xtr.var(axis=0).mean())
    # print('xvl.var(axis=0).mean()', xvl.var(axis=0).mean())

    # Concat
    # data = pd.concat([pd.DataFrame(ydata), xdata], axis=1)
    data_train = pd.concat([pd.DataFrame(ytr), xtr], axis=1)
    data_val = pd.concat([pd.DataFrame(yvl), xvl], axis=1)
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape {data_val.shape}')

    # Save
    # data.to_csv(os.path.join(OUTDIR, f'{APP}_data'), sep='\t', index=False)
    data_train.to_csv(os.path.join(OUTDIR, f'{APP}_data_train_{dataset}'), sep='\t', index=False)
    data_val.to_csv(os.path.join(OUTDIR, f'{APP}_data_val_{dataset}'), sep='\t', index=False)
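# ----------------------------------------------------------------------------
# utils.balance_df is not shown in this excerpt; from its call signature it
# evidently downsamples every class to class_size. A rough, assumed sketch of
# that behavior (the real helper also returns the dropped classes):
import numpy as np
import pandas as pd

def _balance_df_sketch(df, y, class_size, seed=0):
    y = pd.Series(np.asarray(y), index=df.index)
    keep = []
    for _, grp in y.groupby(y):
        keep.extend(grp.sample(n=min(class_size, len(grp)), random_state=seed).index)
    return df.loc[keep], y.loc[keep]
# ----------------------------------------------------------------------------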
def run(args):
    print(args)
    epochs = args.epochs
    batch = args.batch

    # ========== NN classifier ==========
    print('\nLoading classification dataset ...')

    # Load data
    data = pd.read_csv(DATAPATH_CLASSIFICATION, sep='\t')
    print('data.shape', data.shape)
    print(data.iloc[:3, :5])

    xdata = data.iloc[:, 1:].values
    ydata = data.iloc[:, 0].values
    print('np.unique(ydata)', np.unique(ydata))
    n_classes = len(np.unique(ydata))
    ydata = keras.utils.to_categorical(ydata, num_classes=n_classes)

    # Scale data
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)

    # Split data
    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                          random_state=SEED, shuffle=True, stratify=ydata)

    print('\nTrain NN Classifier ...')
    keras_model = create_nn_classifier(n_features=xtr.shape[1], n_classes=n_classes)
    history = keras_model.fit(xtr, ytr, epochs=epochs, batch_size=batch, verbose=0)
    score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    print('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    print('f1_score micro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='micro')))
    print('f1_score macro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='macro')))

    # Create dir to save the model
    MODELDIR = os.path.join(file_path, 'keras_model_classifier')
    utils.make_dir(MODELDIR)

    # Save the model architecture
    print('\nSave keras model (classifier) ...')
    model_json = keras_model.to_json()
    model_path = os.path.join(MODELDIR, 'keras_model.json')
    with open(model_path, 'w') as json_file:
        json_file.write(model_json)

    # Save the trained weights to HDF5
    print('Saving keras weights ...')
    weights_path = os.path.join(MODELDIR, 'keras_weights.h5')
    keras_model.save_weights(weights_path)

    # ========== NN regressor ==========
    print('\nLoad regression dataset ...')

    # Load data
    data = pd.read_csv(DATAPATH_REGRESSION, sep='\t')
    print('data.shape', data.shape)
    print(data.iloc[:3, :5])

    ydata = data.iloc[:, 0].values
    xdata = data.iloc[:, 1:].values

    # Scale data
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)

    # Split data
    xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2,
                                          random_state=SEED, shuffle=True)

    print('\nTrain NN Regressor ...')
    keras_model = create_nn_regressor(n_features=xtr.shape[1])
    history = keras_model.fit(xtr, ytr, epochs=epochs, batch_size=batch, verbose=0)
    score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    print('Prediction score (val loss): {:.4f}'.format(score))

    # Create dir to save the model
    MODELDIR = os.path.join(file_path, 'keras_model_regressor')
    utils.make_dir(MODELDIR)

    # Save the model architecture
    print('\nSave keras model (regressor) ...')
    model_json = keras_model.to_json()
    model_path = os.path.join(MODELDIR, 'keras_model.json')
    with open(model_path, 'w') as json_file:
        json_file.write(model_json)

    # Save the trained weights to HDF5
    print('Save keras weights ...')
    weights_path = os.path.join(MODELDIR, 'keras_weights.h5')
    keras_model.save_weights(weights_path)
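# ----------------------------------------------------------------------------
# Round-trip sanity check sketch for the save path above: the inference
# scripts in this repo reload models via model_from_json + load_weights, so
# the reloaded model should reproduce the trained model's predictions.
# (Illustrative; assumes `from keras.models import model_from_json`.)
import numpy as np
from keras.models import model_from_json

def _check_roundtrip(model_path, weights_path, trained_model, x):
    with open(model_path, 'r') as f:
        reloaded = model_from_json(f.read())
    reloaded.load_weights(weights_path)
    assert np.allclose(trained_model.predict(x), reloaded.predict(x))
# ----------------------------------------------------------------------------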