def run(args):
    t0 = time()

    # Global outdir
    gout = Path(args['global_outdir'])
    os.makedirs(gout, exist_ok=True)

    # dirpath = verify_dirpath(args['dirpath'])
    data = read_data_file(filepath / args['filepath'], 'parquet')
    print('data.shape', data.shape)

    # Get features (x), target (y), and meta
    fea_list = args['cell_fea'] + args['drug_fea']
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    ydata = meta[[args['target_name']]]
    del data

    # ML type ('reg' or 'cls')
    if 'reg' in args['model_name']:
        mltype = 'reg'
    elif 'cls' in args['model_name']:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Create logger
    lg = Logger(gout / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    def get_unq_split_ids(all_splits_path):
        """ Return the unique split ids extracted from the split file names. """
        unq = [p.split(os.sep)[-1].split('_')[1] for p in all_splits_path]
        unq = np.unique(unq)
        return unq

    all_splits_path = glob(str(Path(args['splitpath']) / '1fold_*_id.csv'))
    unq_split_ids = get_unq_split_ids(all_splits_path)
    run_times = []

    # Append scores (dicts)
    tr_scores_all = []
    vl_scores_all = []
    te_scores_all = []

    # Sample size at each run
    smp_sz = []
    file_smp_sz = open(gout / 'sample_sz', 'w')
    file_smp_sz.write('run\ttr_sz\tvl_sz\tte_sz\n')

    # Iterate over splits
    n_splits = None if args['n_splits'] is None else (args['n_splits'] + 1)
    for i, split_id in enumerate(unq_split_ids[:n_splits]):
        # print(f'Split {split_id}')

        # Get indices for the split
        aa = [p for p in all_splits_path if f'1fold_{split_id}' in p]
        if len(aa) < 2:
            print(f'The split {split_id} contains only one file.')
            continue
        for id_file in aa:
            if 'tr_id' in id_file:
                tr_id = read_data_file(id_file)
            elif 'vl_id' in id_file:
                vl_id = read_data_file(id_file)
            elif 'te_id' in id_file:
                te_id = read_data_file(id_file)

        # Define run outdir
        rout = gout / f'run_{split_id}'
        os.makedirs(rout, exist_ok=True)

        # Scaling
        # xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])  # scale features

        # Extract Train (T), Validation (V), and Test (E) sets
        tr_id = tr_id.iloc[:, 0].values.astype(int).tolist()
        vl_id = vl_id.iloc[:, 0].values.astype(int).tolist()
        te_id = te_id.iloc[:, 0].values.astype(int).tolist()
        xtr, ytr, mtr = get_data_by_id(tr_id, xdata, ydata, meta)  # samples sequentially sampled for TRAIN
        xvl, yvl, mvl = get_data_by_id(vl_id, xdata, ydata, meta)  # fixed set of VAL samples for the current CV split
        xte, yte, mte = get_data_by_id(te_id, xdata, ydata, meta)  # fixed set of TEST samples for the current CV split

        # Extract val data
        # from sklearn.model_selection import train_test_split
        # id_arr = np.arange(len(xtr))
        # tr_, vl_ = train_test_split(id_arr, test_size=0.1)
        # xvl = xtr.iloc[vl_, :].reset_index(drop=True)
        # xtr = xtr.iloc[tr_, :].reset_index(drop=True)
        # mvl = mtr.iloc[vl_, :].reset_index(drop=True)
        # mtr = mtr.iloc[tr_, :].reset_index(drop=True)
        # yvl = ytr.iloc[vl_].reset_index(drop=True)
        # ytr = ytr.iloc[tr_].reset_index(drop=True)

        # Remove samples inside the AUC gap
        min_gap = args['min_gap']
        max_gap = args['max_gap']
        if (min_gap is not None) and (max_gap is not None):
            # Use a 1-d mask (ytr is a single-column DataFrame)
            idx = (ytr.iloc[:, 0].values > min_gap) & (ytr.iloc[:, 0].values < max_gap)
            xtr = xtr[~idx]
            mtr = mtr[~idx]
            ytr = ytr[~idx]
        def drop_samples(x_df, y_df, m_df, items_to_drop, drop_by: str):
            """
            Args:
                drop_by : col in the meta df ('CELL', 'DRUG', 'CTYPE')
            """
            id_drop = m_df[drop_by].isin(items_to_drop)
            x_df = x_df[~id_drop].reset_index(drop=True)
            y_df = y_df[~id_drop].reset_index(drop=True)
            m_df = m_df[~id_drop].reset_index(drop=True)
            return x_df, y_df, m_df

        # Drop cell lines
        # if args['cell_list_drop'] is not None:
        #     cell_to_drop_fpath = Path(args['cell_list_drop'])
        #     cell_to_drop_fname = 'cell_list_tmp'
        #     cell_to_drop_fpath = filepath / cell_to_drop_fname
        if args['cell_list_drop'] is not None:
            cell_to_drop_fpath = Path(args['cell_list_drop'])
            if cell_to_drop_fpath.exists():
                with open(cell_to_drop_fpath, 'r') as f:
                    cells_to_drop = [line.rstrip() for line in f]
                xtr, ytr, mtr = drop_samples(x_df=xtr, y_df=ytr, m_df=mtr,
                                             items_to_drop=cells_to_drop, drop_by='CELL')
                xvl, yvl, mvl = drop_samples(x_df=xvl, y_df=yvl, m_df=mvl,
                                             items_to_drop=cells_to_drop, drop_by='CELL')
                xte, yte, mte = drop_samples(x_df=xte, y_df=yte, m_df=mte,
                                             items_to_drop=cells_to_drop, drop_by='CELL')

        line = 's{}\t{}\t{}\t{}\n'.format(split_id, xtr.shape[0], xvl.shape[0], xte.shape[0])
        file_smp_sz.write(line)

        # Adjust the responses
        if mltype == 'cls':
            ytr = bin_rsp(ytr, resp_thres=0.5)
            yvl = bin_rsp(yvl, resp_thres=0.5)
            yte = bin_rsp(yte, resp_thres=0.5)

        # Define ML model
        if 'lgb' in args['model_name']:
            args['framework'] = 'lightgbm'
        elif args['model_name'] == 'rf_reg':
            args['framework'] = 'sklearn'
        elif 'nn_' in args['model_name']:
            args['framework'] = 'keras'

        model_init_kwargs, model_fit_kwargs = get_model_kwargs(args)

        # Get the estimator
        estimator = ml_models.get_model(args['model_name'], init_kwargs=model_init_kwargs)
        model = estimator.model

        # Train
        eval_set = (xvl, yvl)
        # eval_set = None
        if args['framework'] == 'lightgbm':
            model, runtime = trn_lgbm_model(model=model, xtr=xtr, ytr=ytr,
                                            eval_set=eval_set, fit_kwargs=model_fit_kwargs)
        elif args['framework'] == 'sklearn':
            model, runtime = trn_sklearn_model(model=model, xtr_sub=xtr, ytr_sub=ytr,
                                               eval_set=None, fit_kwargs=model_fit_kwargs)
        elif args['framework'] == 'keras':
            model, runtime = trn_keras_model(model=model, xtr_sub=xtr, ytr_sub=ytr,
                                             eval_set=eval_set)
        elif args['framework'] == 'pytorch':
            pass
        else:
            raise ValueError(f"Framework {args['framework']} is not yet supported.")

        if model is None:
            continue  # sometimes keras fails to train a model (evaluates to nan)

        # Append runtime
        run_times.append(runtime)

        # Dump model
        if args['save_model']:
            joblib.dump(model, filename=rout / ('model.' + args['model_name'] + '.pkl'))

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_preds(y_true, y_pred, meta=mtr, outpath=rout / 'preds_tr.csv')

        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_preds(y_true, y_pred, meta=mvl, outpath=rout / 'preds_vl.csv')
        # ... test set
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
        te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_preds(y_true, y_pred, meta=mte, outpath=rout / 'preds_te.csv')

        # Add metadata
        tr_scores['run'] = split_id
        vl_scores['run'] = split_id
        te_scores['run'] = split_id

        # Append scores (dicts)
        tr_scores_all.append(tr_scores)
        vl_scores_all.append(vl_scores)
        te_scores_all.append(te_scores)

        # Free space
        # del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, tr_, vl_
        del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, eval_set, model, estimator

        if i % 10 == 0:
            print(f'Finished {split_id}')

    file_smp_sz.close()

    # Scores to df
    tr_scores_df = scores_to_df(tr_scores_all)
    vl_scores_df = scores_to_df(vl_scores_all)
    te_scores_df = scores_to_df(te_scores_all)
    tr_scores_df.to_csv(gout / 'tr_scores.csv', index=False)
    vl_scores_df.to_csv(gout / 'vl_scores.csv', index=False)
    te_scores_df.to_csv(gout / 'te_scores.csv', index=False)

    if (time() - t0) // 3600 > 0:
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        lg.logger.info('Runtime: {:.1f} min'.format((time() - t0) / 60))

    del tr_scores_df, vl_scores_df, te_scores_df

    # --------------------------------------------------------
    # Calc stats
    def reorg_cols(df, col_first: str):
        """
        Args:
            col_first : col name to put first
        """
        cols = df.columns.tolist()
        cols.remove(col_first)
        return df[[col_first] + cols]

    def agg_preds_from_cls_runs(runs_dirs, phase='_te.csv', verbose=False):
        """ Aggregate predictions from the bootstrapped ML training runs. """
        prd = []
        for i, dir_name in enumerate(runs_dirs):
            if '_tr.csv' in phase:
                prd_ = pd.read_csv(dir_name / 'preds_tr.csv')
            elif '_vl.csv' in phase:
                prd_ = pd.read_csv(dir_name / 'preds_vl.csv')
            elif '_te.csv' in phase:
                prd_ = pd.read_csv(dir_name / 'preds_te.csv')

            # prd_['err'] = abs(prd_['y_true'] - prd_['y_pred'])  # add col 'err'
            prd_['run'] = str(dir_name).split(os.sep)[-1].split('_')[-1]  # add col 'run' identifier
            prd.append(prd_)  # append run data

            if verbose:
                if i % 20 == 0:
                    print(f'Processing {dir_name}')

        # Aggregate to df
        prd = pd.concat(prd, axis=0)

        # Reorganize cols
        prd = reorg_cols(prd, col_first='run').sort_values('run').reset_index(
            drop=True).reset_index().rename(columns={'index': 'idx'})
        return prd

    # Concat preds from all runs
    runs_dirs = [Path(p) for p in glob(str(gout / 'run_*'))]
    prd_te_all = agg_preds_from_cls_runs(runs_dirs, phase='_te.csv')

    if 'source' not in [str(c).lower() for c in prd_te_all.columns.to_list()]:
        prd_te_all.insert(loc=2, column='SOURCE',
                          value=[s.split('.')[0].lower() for s in prd_te_all['CELL']])

    # Cancer types
    cancer_types = pd.read_csv(filepath / 'data/combined_cancer_types',
                               sep='\t', names=['CELL', 'CTYPE'])

    # Add CTYPE column
    prd_te_all = pd.merge(prd_te_all, cancer_types, on='CELL')
    prd_te_all = reorg_cols(prd_te_all, col_first='CTYPE')

    # Rename
    prd_te_all = prd_te_all.rename(columns={'y_true': 'y_true_cls', 'y_pred': 'y_pred_prob'})

    # Retain specific columns
    cols = ['idx', 'run', 'SOURCE', 'CTYPE', 'CELL', 'DRUG', 'R2fit', 'AUC',
            'y_true_cls', 'y_pred_prob']
    prd_te_all = prd_te_all[cols]

    # Add col of pred labels
    prd_te_all['y_pred_cls'] = prd_te_all.y_pred_prob.map(lambda x: 0 if x < 0.5 else 1)

    # Per-sample probability error: the lowest value is 0, the value at the decision
    # threshold is 0.5, and its square (averaged over samples) gives the Brier score.
    prd_te_all['prob_err'] = abs(prd_te_all.y_true_cls - prd_te_all.y_pred_prob)

    # Bin AUC values
    bins = np.arange(0, 1.1, 0.1).tolist()
    prd_te_all['AUC_bin'] = pd.cut(prd_te_all.AUC, bins, right=True, labels=None,
                                   retbins=False, precision=3, include_lowest=False,
                                   duplicates='raise')

    # Add col that categorizes the preds (use .loc to avoid chained assignment)
    prd_te_all['prd_cat'] = None
    prd_te_all.loc[(prd_te_all.y_true_cls == 1) & (prd_te_all.y_pred_cls == 1), 'prd_cat'] = 'TP'
    prd_te_all.loc[(prd_te_all.y_true_cls == 0) & (prd_te_all.y_pred_cls == 0), 'prd_cat'] = 'TN'
    prd_te_all.loc[(prd_te_all.y_true_cls == 1) & (prd_te_all.y_pred_cls == 0), 'prd_cat'] = 'FN'
    prd_te_all.loc[(prd_te_all.y_true_cls == 0) & (prd_te_all.y_pred_cls == 1), 'prd_cat'] = 'FP'

    # Add cols
    prd_te_all['TP'] = prd_te_all.prd_cat == 'TP'
    prd_te_all['TN'] = prd_te_all.prd_cat == 'TN'
    prd_te_all['FP'] = prd_te_all.prd_cat == 'FP'
    prd_te_all['FN'] = prd_te_all.prd_cat == 'FN'

    # Save aggregated master table
    prd_te_all.to_csv(gout / 'prd_te_all.csv', index=False)

    # Plot confusion matrix
    from sklearn.metrics import confusion_matrix, matthews_corrcoef, ConfusionMatrixDisplay

    # y_true_cls = prd_te_all.y_true_cls
    # y_pred_cls = prd_te_all.y_pred.map(lambda x: 0 if x < 0.5 else 1)
    y_true_cls = prd_te_all.y_true_cls
    y_pred_cls = prd_te_all.y_pred_cls
    np_conf = confusion_matrix(y_true_cls, y_pred_cls)
    tn, fp, fn, tp = confusion_matrix(y_true_cls, y_pred_cls).ravel()
    mcc = matthews_corrcoef(y_true_cls, y_pred_cls, sample_weight=None)

    print('TN:', tn)
    print('FP:', fp)
    print('FN:', fn)
    print('TP:', tp)
    print('FPR:', fp / (fp + tn))
    print('FNR:', fn / (fn + tp))
    print('MCC:', mcc)

    with open(gout / 'scores.txt', 'w') as f:
        f.write('TN: {:d}\n'.format(tn))
        f.write('FP: {:d}\n'.format(fp))
        f.write('FN: {:d}\n'.format(fn))
        f.write('TP: {:d}\n'.format(tp))
        f.write('FPR: {:.5f}\n'.format(fp / (fp + tn)))
        f.write('FNR: {:.5f}\n'.format(fn / (fn + tp)))
        f.write('MCC: {:.5f}\n'.format(mcc))

    # Confusion matrix
    conf = confusion_matrix(y_true_cls, y_pred_cls, normalize=None)
    conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp'])
    conf_plot.plot(include_values=True, cmap=plt.cm.Blues, ax=None,
                   xticks_rotation=None, values_format='d')
    plt.savefig(gout / 'conf_mat.png', dpi=100)

    # Confusion matrix (normalized)
    conf = confusion_matrix(y_true_cls, y_pred_cls, normalize='all')
    conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp'])
    conf_plot.plot(include_values=True, cmap=plt.cm.Blues, ax=None,
                   xticks_rotation=None, values_format='.2f')
    conf_plot.ax_.set_title('Normalized')
    plt.savefig(gout / 'conf_mat_norm.png', dpi=100)

    def add_conf_data(data):
        """ Add columns used to compute confusion-matrix counts (TP, TN, FN, FP)
        and the derived rates (TPR, TNR, FPR, FNR).
""" data['TP'] = data.apply(lambda row: row.y_pred_cls_1 if row.y_true == 1 else False, axis=1) # tp data['TN'] = data.apply(lambda row: row.y_pred_cls_0 if row.y_true == 0 else False, axis=1) # tn data['FN'] = data.apply(lambda row: row.y_pred_cls_0 if row.y_true == 1 else False, axis=1) # fn data['FP'] = data.apply(lambda row: row.y_pred_cls_1 if row.y_true == 0 else False, axis=1) # fp data['TPR'] = data.apply( lambda row: np.nan if (row.TP == 0) & (row.FN == 0) else row.TP / (row.TP + row.FN), axis=1) # sensitivity, recall: TP/P = TP/(TP+FN) data['TNR'] = data.apply(lambda row: np.nan if (row.TN == 0) & (row.FP == 0) else row.TN / (row.TN + row.FP), axis=1) # specificity: TN/N = TN/(TN+FP) data['FPR'] = data.apply(lambda row: np.nan if (row.TN == 0) & (row.FP == 0) else row.FP / (row.TN + row.FP), axis=1) # fall-out: FP/N = FP/(FP+TN) data['FNR'] = data.apply(lambda row: np.nan if (row.TP == 0) & (row.FN == 0) else row.FN / (row.TP + row.FN), axis=1) # miss-rate: FN/NP = FN/(FN+TP) return data # Summary table prd_te_to_grp = prd_te_all.copy() prd_te_to_grp['y_pred_prob_median'] = prd_te_to_grp.y_pred_prob prd_te_to_grp['y_pred_prob_std'] = prd_te_to_grp.y_pred_prob prd_te_to_grp['y_pred_tot'] = prd_te_to_grp.idx prd_te_to_grp['y_pred_cls_0'] = prd_te_to_grp.y_pred.map( lambda x: True if x < 0.5 else False) prd_te_to_grp['y_pred_cls_1'] = prd_te_to_grp.y_pred.map( lambda x: True if x >= 0.5 else False) prd_te_to_grp['y_true_unq_vals'] = prd_te_to_grp.y_true_cls # ----------------------- # Groupby Cell # ----------------------- by = 'CELL' sm_cell = prd_te_to_grp.groupby([by, 'y_true']).agg({ 'DRUG': 'unique', 'CTYPE': 'unique', 'y_true_unq_vals': 'unique', 'y_pred_prob_median': np.median, 'y_pred_prob_std': np.std, 'y_pred_cls_0': lambda x: int(sum(x)), 'y_pred_cls_1': lambda x: int(sum(x)), 'y_pred_tot': lambda x: len(np.unique(x)), }).reset_index().sort_values(by, ascending=True) sm_cell['y_true_unq_vals'] = sm_cell.y_true_unq_vals.map( lambda x: len(x) if type(x) == np.ndarray else 1) sm_cell = add_conf_data(sm_cell) sm_cell.to_csv(gout / 'sm_by_cell.csv', index=False) # ----------------------- # Groupby Cancer Type # ----------------------- by = 'CTYPE' sm_ctype = prd_te_to_grp.groupby([by, 'y_true']).agg({ 'DRUG': 'unique', 'CELL': 'unique', 'y_true_unq_vals': 'unique', 'y_pred_prob_median': np.median, 'y_pred_prob_std': np.std, 'y_pred_cls_0': lambda x: int(sum(x)), 'y_pred_cls_1': lambda x: int(sum(x)), 'y_pred_tot': lambda x: len(np.unique(x)), }).reset_index().sort_values(by, ascending=True) sm_ctype['y_true_unq_vals'] = sm_ctype.y_true_unq_vals.map( lambda x: len(x) if type(x) == np.ndarray else 1) sm_ctype = add_conf_data(sm_ctype) sm_ctype.to_csv(gout / 'sm_by_ctype.csv', index=False) # ----------------------- # Groupby Drug # ----------------------- by = 'DRUG' sm_drug = prd_te_to_grp.groupby([by, 'y_true']).agg({ 'CTYPE': 'unique', 'CELL': 'unique', 'y_true_unq_vals': 'unique', 'y_pred_prob_median': np.median, 'y_pred_prob_std': np.std, 'y_pred_cls_0': lambda x: int(sum(x)), 'y_pred_cls_1': lambda x: int(sum(x)), 'y_pred_tot': lambda x: len(np.unique(x)), }).reset_index().sort_values(by, ascending=True) sm_drug['y_true_unq_vals'] = sm_drug.y_true_unq_vals.map( lambda x: len(x) if type(x) == np.ndarray else 1) sm_drug = add_conf_data(sm_drug) sm_drug.to_csv(gout / 'sm_by_drug.csv', index=False) # -------------------------------------------------------- lg.kill_logger()
def trn_learning_curve(
        self,
        framework: str = 'lightgbm',
        mltype: str = 'reg',
        model_name: str = 'lgb_reg',  # TODO: this is redundant
        init_kwargs: dict = {},
        fit_kwargs: dict = {},
        clr_keras_kwargs: dict = {},
        metrics: list = ['r2', 'neg_mean_absolute_error',
                         'neg_median_absolute_error', 'neg_mean_squared_error'],
        n_jobs: int = 4,
        random_state: int = None,
        plot=True):
    """
    Args:
        framework : ML framework (keras, lightgbm, or sklearn)
        mltype : type of ML problem (reg or cls)
        init_kwargs : dict of parameters that initialize the estimator
        fit_kwargs : dict of parameters passed to the estimator's fit() method
        clr_keras_kwargs : dict of cyclical-learning-rate parameters for keras
        metrics : list of metric names (TODO: allow passing a string of metrics)
    """
    self.framework = framework
    self.mltype = mltype
    self.model_name = model_name
    self.init_kwargs = init_kwargs
    self.fit_kwargs = fit_kwargs
    self.clr_keras_kwargs = clr_keras_kwargs
    self.metrics = metrics
    self.n_jobs = n_jobs
    self.random_state = random_state

    # Start nested loop of train size and cv folds
    tr_scores_all = []  # list of dicts
    vl_scores_all = []  # list of dicts
    te_scores_all = []  # list of dicts

    # Record runtime per shard
    runtime_records = []

    # CV loop
    for fold, (tr_k, vl_k, te_k) in enumerate(
            zip(self.tr_dct.keys(), self.vl_dct.keys(), self.te_dct.keys())):
        fold = fold + 1
        if self.logger is not None:
            self.logger.info(f'Fold {fold}/{self.cv_folds}')

        # Get the indices for this fold
        tr_id = self.tr_dct[tr_k]
        vl_id = self.vl_dct[vl_k]
        te_id = self.te_dct[te_k]

        # Samples from this dataset are randomly sampled for training
        xtr = self.X[tr_id, :]
        # ytr = self.Y[tr_id, :]
        ytr = np.squeeze(self.Y[tr_id, :])

        # A fixed set of val samples for the current CV split
        xvl = self.X[vl_id, :]
        yvl = np.squeeze(self.Y[vl_id, :])

        # A fixed set of test samples for the current CV split
        xte = self.X[te_id, :]
        yte = np.squeeze(self.Y[te_id, :])

        # Shards loop (iterate across the dataset sizes and train)
        # Note: we don't shuffle the dataset another time, i.e. we don't do
        #     np.random.seed(random_state)
        #     idx = np.random.permutation(len(xtr))
""" idx = np.arange(len(xtr)) for i, tr_sz in enumerate(self.tr_shards): # For each shard: train model, save best model, calc tr_scores, calc_vl_scores if self.logger: self.logger.info( f'\tTrain size: {tr_sz} ({i+1}/{len(self.tr_shards)})') # Sequentially get a subset of samples (the input dataset X must be shuffled) xtr_sub = xtr[idx[:tr_sz], :] # ytr_sub = np.squeeze(ytr[idx[:tr_sz], :]) ytr_sub = ytr[idx[:tr_sz]] # Get the estimator estimator = ml_models.get_model(self.model_name, init_kwargs=self.init_kwargs) model = estimator.model # Train # self.val_split = 0 # 0.1 # used for early stopping #self.eval_frac = 0.1 # 0.1 # used for early stopping #eval_samples = int(self.eval_frac * xvl.shape[0]) #eval_set = (xvl[:eval_samples, :], yvl[:eval_samples]) # we don't random sample; the same eval_set is used for early stopping eval_set = (xvl, yvl) if self.framework == 'lightgbm': model, trn_outdir, runtime = self.trn_lgbm_model( model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub, fold=fold, tr_sz=tr_sz, eval_set=eval_set) elif self.framework == 'sklearn': model, trn_outdir, runtime = self.trn_sklearn_model( model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub, fold=fold, tr_sz=tr_sz, eval_set=None) elif self.framework == 'keras': model, trn_outdir, runtime = self.trn_keras_model( model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub, fold=fold, tr_sz=tr_sz, eval_set=eval_set) elif self.framework == 'pytorch': pass else: raise ValueError( f'Framework {self.framework} is not supported.') # Save plot of target distribution plot_hist(ytr_sub, var_name=f'Target (Train size={tr_sz})', fit=None, bins=100, path=trn_outdir / 'target_hist_tr.png') plot_hist(yvl, var_name=f'Target (Val size={len(yvl)})', fit=None, bins=100, path=trn_outdir / 'target_hist_vl.png') plot_hist(yte, var_name=f'Target (Test size={len(yte)})', fit=None, bins=100, path=trn_outdir / 'target_hist_te.png') # Calc preds and scores TODO: dump preds # ... training set y_pred, y_true = calc_preds(model, x=xtr_sub, y=ytr_sub, mltype=self.mltype) tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None) tr_scores['y_avg'] = np.mean(y_pred) # ... val set y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=self.mltype) vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None) vl_scores['y_avg'] = np.mean(y_pred) # ... test set y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=self.mltype) te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None) te_scores['y_avg'] = np.mean(y_pred) del estimator, model # Save predictions (need to include metadata) # TODO pass # Store runtime runtime_records.append((fold, tr_sz, runtime)) # Add metadata # tr_scores['tr_set'] = True tr_scores['set'] = 'tr' tr_scores['fold'] = 'fold' + str(fold) tr_scores['tr_size'] = tr_sz # vl_scores['tr_set'] = False vl_scores['set'] = 'vl' vl_scores['fold'] = 'fold' + str(fold) vl_scores['tr_size'] = tr_sz # te_scores['tr_set'] = False te_scores['set'] = 'te' te_scores['fold'] = 'fold' + str(fold) te_scores['tr_size'] = tr_sz # Append scores (dicts) tr_scores_all.append(tr_scores) vl_scores_all.append(vl_scores) te_scores_all.append(te_scores) # Dump intermediate scores # TODO: test this! 
            scores_tmp = pd.concat([scores_to_df([tr_scores]),
                                    scores_to_df([vl_scores]),
                                    scores_to_df([te_scores])], axis=0)
            scores_tmp.to_csv(trn_outdir / 'scores_tmp.csv', index=False)
            del trn_outdir, scores_tmp

        # Dump intermediate results (useful if the run terminates before completing all folds)
        scores_all_df_tmp = pd.concat([scores_to_df(tr_scores_all),
                                       scores_to_df(vl_scores_all),
                                       scores_to_df(te_scores_all)], axis=0)
        scores_all_df_tmp.to_csv(self.outdir / ('_lrn_crv_scores_cv' + str(fold) + '.csv'),
                                 index=False)

    # Scores to df
    tr_scores_df = scores_to_df(tr_scores_all)
    vl_scores_df = scores_to_df(vl_scores_all)
    te_scores_df = scores_to_df(te_scores_all)
    scores_df = pd.concat([tr_scores_df, vl_scores_df, te_scores_df], axis=0)

    # Dump final results
    tr_scores_df.to_csv(self.outdir / 'tr_lrn_crv_scores.csv', index=False)
    vl_scores_df.to_csv(self.outdir / 'vl_lrn_crv_scores.csv', index=False)
    te_scores_df.to_csv(self.outdir / 'te_lrn_crv_scores.csv', index=False)
    scores_df.to_csv(self.outdir / 'lrn_crv_scores.csv', index=False)

    # Runtime df
    runtime_df = pd.DataFrame.from_records(runtime_records, columns=['fold', 'tr_sz', 'time'])
    runtime_df.to_csv(self.outdir / 'runtime.csv', index=False)

    # Plot learning curves
    if plot:
        plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir)
        plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir,
                                 xtick_scale='log2', ytick_scale='log2')
        plot_runtime(runtime_df, outdir=self.outdir,
                     xtick_scale='log2', ytick_scale='log2')

    return scores_df
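
# scores_to_df() is used throughout this module but defined elsewhere. The function
# below is a minimal sketch under the assumption that each element of the input list
# is a flat dict of scalar metrics plus bookkeeping keys (run/set/fold/tr_size), as
# produced by calc_scores() and the metadata assignments above; the real helper may
# melt or reorder columns differently. The name scores_to_df_sketch is illustrative.


def scores_to_df_sketch(scores_list):
    """ Sketch: turn a list of score dicts into a tidy DataFrame with bookkeeping
    columns (if present) placed first. """
    df = pd.DataFrame(scores_list)
    first = [c for c in ('run', 'set', 'fold', 'tr_size') if c in df.columns]
    return df[first + [c for c in df.columns if c not in first]]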
def run(args):
    dirpath = Path(args['dirpath'])
    target_name = args['target_name']
    cv_folds = args['cv_folds']

    # Features
    # cell_fea = args['cell_fea']
    # drug_fea = args['drug_fea']
    # fea_list = cell_fea + drug_fea

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']
    attn = args['attn']

    # Optimizer
    opt_name = args['opt']
    clr_keras_kwargs = {'mode': args['clr_mode'], 'base_lr': args['clr_base_lr'],
                        'max_lr': args['clr_max_lr'], 'gamma': args['clr_gamma']}

    # Other params
    model_name = args['model_name']
    skp_ep = args['skp_ep']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls')
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    src = dirpath.name.split('_')[0]

    # -----------------------------------------------
    # Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath).split('_')[0] + '_trn')
    # os.makedirs(outdir, exist_ok=True)
    run_outdir = create_outdir(outdir, args, src)
    lg = Logger(run_outdir / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=run_outdir / 'args.txt')

    # -----------------------------------------------
    # Load data and pre-proc
    # -----------------------------------------------
    def get_file(fpath):
        return pd.read_csv(fpath, header=None).squeeze().values if fpath.is_file() else None

    def read_data_file(fpath, file_format='csv'):
        fpath = Path(fpath)
        df = None
        if fpath.is_file():
            if file_format == 'csv':
                df = pd.read_csv(fpath)
            elif file_format == 'parquet':
                df = pd.read_parquet(fpath)
        return df

    # Data splits
    tr_id = pd.read_csv(dirpath / f'{cv_folds}fold_tr_id.csv')
    vl_id = pd.read_csv(dirpath / f'{cv_folds}fold_vl_id.csv')
    te_id = pd.read_csv(dirpath / 'te_id.csv')

    tr_dct = {}
    vl_dct = {}
    for fold in range(tr_id.shape[1]):
        tr_dct[fold] = tr_id.iloc[:, fold].dropna().values.astype(int).tolist()
        vl_dct[fold] = vl_id.iloc[:, fold].dropna().values.astype(int).tolist()
    te_id = te_id.iloc[:, 0].dropna().values.astype(int).tolist()

    # Load data
    lg.logger.info('\nLoading data ...')
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[target_name]]

    # Scale
    lg.logger.info('\nScaling data ...')
    scaler = args['scaler']
    if scaler is not None:
        if scaler == 'stnd':
            scaler = StandardScaler()
        elif scaler == 'minmax':
            scaler = MinMaxScaler()
        elif scaler == 'rbst':
            scaler = RobustScaler()
        cols = xdata.columns
        xdata = pd.DataFrame(scaler.fit_transform(xdata), columns=cols, dtype=np.float32)

    # Test set
    xte = xdata.iloc[te_id, :]
    yte = np.squeeze(ydata.iloc[te_id, :]).values

    # -----------------------------------------------
    # ML model configs
    # -----------------------------------------------
    if model_name == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {'n_jobs': n_jobs, 'random_state': SEED, 'logger': lg.logger}
        fit_kwargs = {'verbose': False}
    elif model_name == 'nn_reg':
        framework = 'keras'
        init_kwargs = {'input_dim': xdata.shape[1], 'dr_rate': dr_rate,
                       'opt_name': opt_name, 'attn': attn, 'logger': lg.logger}
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}
    elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg2'):
        framework = 'keras'
        init_kwargs = {'input_dim': xdata.shape[1], 'dr_rate': dr_rate,
                       'opt_name': opt_name, 'logger': lg.logger}
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}
        # 'validation_split': 0.1
    elif model_name in ('nn_reg3', 'nn_reg4'):
        framework = 'keras'
        init_kwargs = {'in_dim_rna': None, 'in_dim_dsc': None, 'dr_rate': dr_rate,
                       'opt_name': opt_name, 'logger': lg.logger}
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}
        # 'validation_split': 0.1

    # -----------------------------------------------
    # Train
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('=' * 50))
    lg.logger.info(f'Train {src} ...')
    lg.logger.info('=' * 50)

    # CV loop
    for fold, (tr_k, vl_k) in enumerate(zip(tr_dct.keys(), vl_dct.keys())):
        if lg.logger is not None:
            lg.logger.info(f'Fold {fold+1}/{cv_folds}')

        tr_id = tr_dct[tr_k]
        vl_id = vl_dct[vl_k]

        # Samples from this dataset are randomly sampled for training
        xtr = xdata.iloc[tr_id, :]
        ytr = np.squeeze(ydata.iloc[tr_id, :]).values

        # A fixed set of validation samples for the current CV split
        xvl = xdata.iloc[vl_id, :]
        yvl = np.squeeze(ydata.iloc[vl_id, :]).values

        # Get the estimator
        estimator = ml_models.get_model(model_name, init_kwargs=init_kwargs)
        model = estimator.model
        keras.utils.plot_model(model, to_file=run_outdir / 'nn_model.png')

        # Callbacks
        # keras_callbacks = define_keras_callbacks(run_outdir)
        model_checkpoint_dir = run_outdir / 'models'
        os.makedirs(model_checkpoint_dir, exist_ok=True)
        checkpointer = ModelCheckpoint(
            str(model_checkpoint_dir /
                'model.ep_{epoch:d}-val_loss_{val_loss:.4f}-val_mae_{val_mean_absolute_error:.4f}.h5'),
            save_best_only=False)
        csv_logger = CSVLogger(run_outdir / 'training.log')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=20,
                                      verbose=1, mode='auto', min_delta=0.0001,
                                      cooldown=3, min_lr=1e-9)
        early_stop = EarlyStopping(monitor='val_loss', patience=60, verbose=1)
        keras_callbacks = [checkpointer, csv_logger, early_stop, reduce_lr]
        if clr_keras_kwargs['mode'] is not None:
            keras_callbacks.append(ml_models.clr_keras_callback(**clr_keras_kwargs))

        # Fit params
        fit_kwargs['validation_data'] = (xvl, yvl)
        fit_kwargs['callbacks'] = keras_callbacks

        # Train
        t0 = time()
        history = model.fit(xtr, ytr, **fit_kwargs)
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

        # Dump model, history, plots
        model.save(str(run_outdir / 'model_final.h5'))
        hh = ml_models.save_krs_history(history, outdir=run_outdir)
        ml_models.plot_prfrm_metrics(history, title='Training', skp_ep=skp_ep,
                                     add_lr=True, outdir=run_outdir)

        # Multi-gpu training
        # keras.utils.multi_gpu_model(model, gpus=[0, 1], cpu_merge=True, cpu_relocation=False)

        # Load the best model to make preds
        eval_metric = 'val_mean_absolute_error'
        ep_best = hh.loc[hh[eval_metric] == hh[eval_metric].min(), 'epoch'].values[0]
        mpath = glob(str(model_checkpoint_dir / f'model.ep_{ep_best}-val_loss*.h5'))[0]
        model = load_model(mpath)

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_dict(tr_scores, outpath=run_outdir / 'tr_scores.txt')
        pd.DataFrame({'y_true': y_true.reshape(-1),
                      'y_pred': y_pred.reshape(-1)}).to_csv(run_outdir / 'tr_preds.csv', index=False)
        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_dict(vl_scores, outpath=run_outdir / 'vl_scores.txt')
        pd.DataFrame({'y_true': y_true.reshape(-1),
                      'y_pred': y_pred.reshape(-1)}).to_csv(run_outdir / 'vl_preds.csv', index=False)

        # ... test set
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
        te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_dict(te_scores, outpath=run_outdir / 'te_scores.txt')
        pd.DataFrame({'y_true': y_true.reshape(-1),
                      'y_pred': y_pred.reshape(-1)}).to_csv(run_outdir / 'te_preds.csv', index=False)

    lg.kill_logger()
    del xdata, ydata
    print('Done.')
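
# calc_preds() and calc_scores() come from an external utils module. The sketches
# below are assumptions based only on how they are called in this file: calc_preds()
# returns (y_pred, y_true) and calc_scores() returns a dict of metric -> value. The
# actual implementations may compute more metrics and handle multi-output targets;
# the _sketch names and metric choices are illustrative, not the project's API.


def calc_preds_sketch(model, x, y, mltype):
    """ Sketch: predict with a trained model and return (y_pred, y_true). """
    if mltype == 'cls' and hasattr(model, 'predict_proba'):
        y_pred = model.predict_proba(x)[:, 1]  # probability of the positive class
    else:
        y_pred = model.predict(x)
    return np.asarray(y_pred).squeeze(), np.asarray(y).squeeze()


def calc_scores_sketch(y_true, y_pred, mltype, metrics=None):
    """ Sketch: compute a small dict of scores per ML type. """
    from sklearn import metrics as skm
    if mltype == 'reg':
        return {'r2': skm.r2_score(y_true, y_pred),
                'mean_absolute_error': skm.mean_absolute_error(y_true, y_pred)}
    return {'roc_auc': skm.roc_auc_score(y_true, y_pred),
            'brier': skm.brier_score_loss(y_true, y_pred)}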
def trn_learning_curve(
        self,
        framework: str = 'lightgbm',
        mltype: str = 'reg',
        model_name: str = 'lgb_reg',  # TODO: this is redundant
        init_kwargs: dict = {},
        fit_kwargs: dict = {},
        clr_keras_kwargs: dict = {},
        metrics: list = ['r2', 'neg_mean_absolute_error',
                         'neg_median_absolute_error', 'neg_mean_squared_error'],
        n_jobs: int = 4,
        random_state: int = None,
        plot=True):
    """
    Args:
        framework : ML framework (keras, lightgbm, or sklearn)
        mltype : type of ML problem (reg or cls)
        init_kwargs : dict of parameters that initialize the estimator
        fit_kwargs : dict of parameters passed to the estimator's fit() method
        clr_keras_kwargs : dict of cyclical-learning-rate parameters for keras
        metrics : list of metric names (TODO: allow passing a string of metrics)
    """
    self.framework = framework
    self.mltype = mltype
    self.model_name = model_name
    self.init_kwargs = init_kwargs
    self.fit_kwargs = fit_kwargs
    self.clr_keras_kwargs = clr_keras_kwargs
    self.metrics = metrics
    self.n_jobs = n_jobs
    self.random_state = random_state

    # Start nested loop of train size and cv folds
    tr_scores_all = []  # list of dicts
    vl_scores_all = []  # list of dicts

    # CV loop
    for fold, (tr_k, vl_k) in enumerate(zip(self.tr_dct.keys(), self.vl_dct.keys())):
        if self.logger is not None:
            self.logger.info(f'Fold {fold+1}/{self.cv_folds}')

        tr_id = self.tr_dct[tr_k]
        vl_id = self.vl_dct[vl_k]

        # Samples from this dataset are randomly sampled for training
        xtr = self.X[tr_id, :]
        ytr = self.Y[tr_id, :]

        # A fixed set of validation samples for the current CV split
        xvl = self.X[vl_id, :]
        yvl = np.squeeze(self.Y[vl_id, :])

        # Shards loop (iterate across the dataset sizes and train)
        # np.random.seed(random_state)
        # idx = np.random.permutation(len(xtr))
        idx = np.arange(len(xtr))
        for i, tr_sz in enumerate(self.tr_shards):
            # For each shard: train model, save the best model, calc tr_scores, calc vl_scores
            if self.logger:
                self.logger.info(f'\tTrain size: {tr_sz} ({i+1}/{len(self.tr_shards)})')

            # Sequentially get a subset of samples (the input dataset X must be shuffled)
            xtr_sub = xtr[idx[:tr_sz], :]
            ytr_sub = np.squeeze(ytr[idx[:tr_sz], :])

            # Get the estimator
            estimator = ml_models.get_model(self.model_name, init_kwargs=self.init_kwargs)
            model = estimator.model

            # Train
            # self.val_split = 0  # 0.1  # used for early stopping
            self.eval_frac = 0.1  # used for early stopping
            eval_samples = int(self.eval_frac * xvl.shape[0])
            eval_set = (xvl[:eval_samples, :], yvl[:eval_samples])
            if self.framework == 'lightgbm':
                model, trn_outdir = self.trn_lgbm_model(
                    model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub,
                    fold=fold, tr_sz=tr_sz, eval_set=eval_set)
            elif self.framework == 'keras':
                model, trn_outdir = self.trn_keras_model(
                    model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub,
                    fold=fold, tr_sz=tr_sz, eval_set=eval_set)
            elif self.framework == 'pytorch':
                pass
            else:
                raise ValueError(f'Framework {self.framework} is not supported.')

            # Calc preds and scores (TODO: dump preds)
            # ... training set
            y_pred, y_true = calc_preds(model, x=xtr_sub, y=ytr_sub, mltype=self.mltype)
            tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None)
            # ... val set
            y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=self.mltype)
            vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None)

            del estimator, model

            # nm = ((y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
            # dn = ((y_true - np.average(y_true, axis=0)) ** 2).sum(axis=0, dtype=np.float64)

            # Add metadata
            tr_scores['tr_set'] = True
            tr_scores['fold'] = 'fold' + str(fold)
            tr_scores['tr_size'] = tr_sz

            vl_scores['tr_set'] = False
            vl_scores['fold'] = 'fold' + str(fold)
            vl_scores['tr_size'] = tr_sz

            # Append scores (dicts)
            tr_scores_all.append(tr_scores)
            vl_scores_all.append(vl_scores)

            # Dump intermediate scores  # TODO
            scores_tmp = pd.concat([scores_to_df(tr_scores_all), scores_to_df(vl_scores_all)], axis=0)
            scores_tmp.to_csv(trn_outdir / 'tmp_scores.csv', index=False)
            del trn_outdir, scores_tmp

        # Dump intermediate results (useful if the run terminates before completing all folds)
        # tr_df_tmp = scores_to_df(tr_scores_all)
        # vl_df_tmp = scores_to_df(vl_scores_all)
        scores_all_df_tmp = pd.concat([scores_to_df(tr_scores_all), scores_to_df(vl_scores_all)], axis=0)
        scores_all_df_tmp.to_csv(self.outdir / ('_lrn_crv_scores_cv' + str(fold + 1) + '.csv'),
                                 index=False)

    # Scores to df
    tr_scores_df = scores_to_df(tr_scores_all)
    vl_scores_df = scores_to_df(vl_scores_all)
    scores_df = pd.concat([tr_scores_df, vl_scores_df], axis=0)

    # Dump final results
    tr_scores_df.to_csv(self.outdir / 'tr_lrn_crv_scores.csv', index=False)
    vl_scores_df.to_csv(self.outdir / 'vl_lrn_crv_scores.csv', index=False)
    scores_df.to_csv(self.outdir / 'lrn_crv_scores.csv', index=False)

    # Plot learning curves
    if plot:
        plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir)

    return scores_df
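
# Example usage (hypothetical): trn_learning_curve() is a method of a learning-curve
# class that is expected to carry X, Y, tr_dct/vl_dct (and te_dct in the newer
# version), cv_folds, tr_shards, outdir, and a logger. The object name `lc` and the
# constructor arguments below are illustrative assumptions, not the project's API.
#
# lc = LearningCurve(X=xdata.values, Y=ydata.values, cv_folds=5,
#                    tr_shards=[128, 256, 512, 1024], outdir=Path('./lc_out'))
# scores_df = lc.trn_learning_curve(framework='lightgbm', mltype='reg',
#                                   model_name='lgb_reg', n_jobs=4)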