Example #1
def run(args):
    # Global outdir
    gout = Path(args['global_outdir'])
    os.makedirs(gout, exist_ok=True)

    # dirpath = verify_dirpath(args['dirpath'])
    data = read_data_file(filepath / args['filepath'], 'parquet')
    print('data.shape', data.shape)

    # Get features (x), target (y), and meta
    fea_list = args['cell_fea'] + args['drug_fea']
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    ydata = meta[[args['target_name']]]
    del data

    # ML type ('reg' or 'cls')
    if 'reg' in args['model_name']:
        mltype = 'reg'
    elif 'cls' in args['model_name']:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Create logger
    lg = Logger(gout / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    def get_unq_split_ids(all_splits_path):
        """ Return the unique split ids extracted from the split file names. """
        unq = [p.split(os.sep)[-1].split('_')[1] for p in all_splits_path]
        unq = np.unique(unq)
        return unq
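    # Example (hypothetical file names): for paths such as
    # ['.../1fold_0_tr_id.csv', '.../1fold_0_te_id.csv', '.../1fold_1_tr_id.csv'],
    # the helper returns array(['0', '1']).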

    all_splits_path = glob(str(Path(args['splitpath']) / '1fold_*_id.csv'))
    unq_split_ids = get_unq_split_ids(all_splits_path)
    run_times = []

    # Append scores (dicts)
    tr_scores_all = []
    vl_scores_all = []
    te_scores_all = []

    # Sample size at each run
    smp_sz = []
    file_smp_sz = open(gout / 'sample_sz', 'w')
    file_smp_sz.write('run\ttr_sz\tvl_sz\tte_sz\n')

    # Iterate over splits
    n_splits = None if args['n_splits'] is None else (args['n_splits'] + 1)
    for i, split_id in enumerate(unq_split_ids[:n_splits]):
        # print(f'Split {split_id}')

        # Get indices for the split
        aa = [p for p in all_splits_path if f'1fold_{split_id}' in p]
        if len(aa) < 2:
            print(f'Split {split_id} has fewer than two id files; skipping.')
            continue
        for id_file in aa:
            if 'tr_id' in id_file:
                tr_id = read_data_file(id_file)
            # elif 'vl_id' in id_file:
            #     # vl_id = read_data_file( id_file )
            #     te_id = read_data_file( id_file )
            elif 'vl_id' in id_file:
                vl_id = read_data_file(id_file)
            elif 'te_id' in id_file:
                te_id = read_data_file(id_file)

        # Define run outdir
        rout = gout / f'run_{split_id}'
        os.makedirs(rout, exist_ok=True)

        # Scaling
        # xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])  # scale features

        # Get training and val data
        # Extract Train set T, Validation set V, and Test set E
        tr_id = tr_id.iloc[:, 0].values.astype(int).tolist()
        vl_id = vl_id.iloc[:, 0].values.astype(int).tolist()
        te_id = te_id.iloc[:, 0].values.astype(int).tolist()
        xtr, ytr, mtr = get_data_by_id(
            tr_id, xdata, ydata,
            meta)  # samples from xtr are sequentially sampled for TRAIN
        xvl, yvl, mvl = get_data_by_id(
            vl_id, xdata, ydata,
            meta)  # fixed set of VAL samples for the current CV split
        xte, yte, mte = get_data_by_id(
            te_id, xdata, ydata,
            meta)  # fixed set of TEST samples for the current CV split

        # Extract val data
        # from sklearn.model_selection import train_test_split
        # id_arr = np.arange(len(xtr))
        # tr_, vl_ = train_test_split(id_arr, test_size=0.1)
        # xvl = xtr.iloc[vl_,:].reset_index(drop=True)
        # xtr = xtr.iloc[tr_,:].reset_index(drop=True)
        # mvl = mtr.iloc[vl_,:].reset_index(drop=True)
        # mtr = mtr.iloc[tr_,:].reset_index(drop=True)
        # yvl = ytr.iloc[vl_].reset_index(drop=True)
        # ytr = ytr.iloc[tr_].reset_index(drop=True)

        # Remove AUC gap
        min_gap = args['min_gap']
        max_gap = args['max_gap']
        if (min_gap is not None) and (max_gap is not None):
            idx = ((ytr.values > min_gap) & (ytr.values < max_gap)).ravel()
            xtr = xtr[~idx]
            mtr = mtr[~idx]
            ytr = ytr[~idx]

        def drop_samples(x_df, y_df, m_df, items_to_drop, drop_by: str):
            """
            Args:
                drop_by : col in df ('CELL', 'DRUG', 'CTYPE')
            """
            id_drop = m_df[drop_by].isin(items_to_drop)
            x_df = x_df[~id_drop].reset_index(drop=True)
            y_df = y_df[~id_drop].reset_index(drop=True)
            m_df = m_df[~id_drop].reset_index(drop=True)
            return x_df, y_df, m_df
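        # Usage sketch (hypothetical cell id):
        #   xtr, ytr, mtr = drop_samples(xtr, ytr, mtr, items_to_drop=['CCLE.22RV1'], drop_by='CELL')
        # removes every sample whose meta 'CELL' value appears in the list.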

        # Drop cell lines listed in a user-provided file
        if args['cell_list_drop'] is not None:
            cell_to_drop_fpath = Path(args['cell_list_drop'])
            if cell_to_drop_fpath.exists():
                with open(cell_to_drop_fpath, 'r') as f:
                    cells_to_drop = [line.rstrip() for line in f]
                xtr, ytr, mtr = drop_samples(x_df=xtr,
                                             y_df=ytr,
                                             m_df=mtr,
                                             items_to_drop=cells_to_drop,
                                             drop_by='CELL')
                xvl, yvl, mvl = drop_samples(x_df=xvl,
                                             y_df=yvl,
                                             m_df=mvl,
                                             items_to_drop=cells_to_drop,
                                             drop_by='CELL')
                xte, yte, mte = drop_samples(x_df=xte,
                                             y_df=yte,
                                             m_df=mte,
                                             items_to_drop=cells_to_drop,
                                             drop_by='CELL')

        line = 's{}\t{}\t{}\t{}\n'.format(split_id, xtr.shape[0], xvl.shape[0],
                                          xte.shape[0])
        file_smp_sz.write(line)

        # Adjust the responses
        if mltype == 'cls':
            ytr = bin_rsp(ytr, resp_thres=0.5)
            yvl = bin_rsp(yvl, resp_thres=0.5)
            yte = bin_rsp(yte, resp_thres=0.5)

        # Define ML model
        if 'lgb' in args['model_name']:
            args['framework'] = 'lightgbm'
        elif args['model_name'] == 'rf_reg':
            args['framework'] = 'sklearn'
        elif 'nn_' in args['model_name']:
            args['framework'] = 'keras'

        model_init_kwargs, model_fit_kwargs = get_model_kwargs(args)

        # Get the estimator
        estimator = ml_models.get_model(args['model_name'],
                                        init_kwargs=model_init_kwargs)
        model = estimator.model

        # Train
        eval_set = (xvl, yvl)
        # eval_set = None
        if args['framework'] == 'lightgbm':
            model, runtime = trn_lgbm_model(model=model,
                                            xtr=xtr,
                                            ytr=ytr,
                                            eval_set=eval_set,
                                            fit_kwargs=model_fit_kwargs)
        elif args['framework'] == 'sklearn':
            model, runtime = trn_sklearn_model(model=model,
                                               xtr_sub=xtr,
                                               ytr_sub=ytr,
                                               eval_set=None,
                                               fit_kwargs=model_fit_kwargs)
        elif args['framework'] == 'keras':
            model, runtime = trn_keras_model(model=model,
                                             xtr_sub=xtr,
                                             ytr_sub=ytr,
                                             eval_set=eval_set)
        elif args['framework'] == 'pytorch':
            pass
        else:
            raise ValueError(f"Framework {args['framework']} is not yet supported.")

        if model is None:
            continue  # sometimes keras fails to train a model (evaluates to nan)

        # Append runtime
        run_times.append(runtime)

        # Dump model
        if args['save_model']:
            joblib.dump(model,
                        filename=rout /
                        ('model.' + args['model_name'] + '.pkl'))

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_preds(y_true, y_pred, meta=mtr, outpath=rout / 'preds_tr.csv')
        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_preds(y_true, y_pred, meta=mvl, outpath=rout / 'preds_vl.csv')
        # ... test set
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
        te_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_preds(y_true, y_pred, meta=mte, outpath=rout / 'preds_te.csv')

        # Add metadata
        tr_scores['run'] = split_id
        vl_scores['run'] = split_id
        te_scores['run'] = split_id

        # Append scores (dicts)
        tr_scores_all.append(tr_scores)
        vl_scores_all.append(vl_scores)
        te_scores_all.append(te_scores)

        # Free space
        # del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, tr_, vl_
        del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, eval_set, model, estimator

        if i % 10 == 0:
            print(f'Finished {split_id}')

    file_smp_sz.close()

    # Scores to df
    tr_scores_df = scores_to_df(tr_scores_all)
    vl_scores_df = scores_to_df(vl_scores_all)
    te_scores_df = scores_to_df(te_scores_all)

    tr_scores_df.to_csv(gout / 'tr_scores.csv', index=False)
    vl_scores_df.to_csv(gout / 'vl_scores.csv', index=False)
    te_scores_df.to_csv(gout / 'te_scores.csv', index=False)

    if (time() - t0) // 3600 > 0:
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        lg.logger.info('Runtime: {:.1f} min'.format((time() - t0) / 60))

    del tr_scores_df, vl_scores_df, te_scores_df

    # --------------------------------------------------------
    # Calc stats
    def reorg_cols(df, col_first: str):
        """
        Args:
            col_first : col name to put first
        """
        cols = df.columns.tolist()
        cols.remove(col_first)
        return df[[col_first] + cols]
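    # e.g., reorg_cols(df, col_first='run') moves the 'run' column to the front
    # while keeping the order of the remaining columns unchanged.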

    def agg_preds_from_cls_runs(runs_dirs, phase='_te.csv', verbose=False):
        """ Aggregate predictions bootstraped ML trainings. """
        prd = []
        for i, dir_name in enumerate(runs_dirs):
            if '_tr.csv' in phase:
                prd_ = pd.read_csv(dir_name / 'preds_tr.csv')
            elif '_vl.csv' in phase:
                prd_ = pd.read_csv(dir_name / 'preds_vl.csv')
            elif '_te.csv' in phase:
                prd_ = pd.read_csv(dir_name / 'preds_te.csv')

            # prd_te_['err'] = abs(prd_te_['y_true'] - prd_te_['y_pred'])      # add col 'err'
            prd_['run'] = str(dir_name).split(
                os.sep)[-1].split('_')[-1]  # add col 'run' identifier
            prd.append(prd_)  # append run data

            if verbose:
                if i % 20 == 0:
                    print(f'Processing {dir_name}')

        # Aggregate to df
        prd = pd.concat(prd, axis=0)

        # Reorganize cols
        prd = reorg_cols(prd, col_first='run').sort_values('run').reset_index(
            drop=True).reset_index().rename(columns={'index': 'idx'})
        return prd
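    # The returned df holds one row per (run, sample) prediction, with a 'run'
    # identifier column and an 'idx' counter added after concatenation.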

    # Concat preds from all runs
    runs_dirs = [Path(p) for p in glob(str(gout / 'run_*'))]
    prd_te_all = agg_preds_from_cls_runs(runs_dirs, phase='_te.csv')
    if 'source' not in [str(i).lower() for i in prd_te_all.columns.to_list()]:
        prd_te_all.insert(
            loc=2,
            column='SOURCE',
            value=[s.split('.')[0].lower() for s in prd_te_all['CELL']])

    # Cancer types
    cancer_types = pd.read_csv(filepath / 'data/combined_cancer_types',
                               sep='\t',
                               names=['CELL', 'CTYPE'])

    # Add CTYPE columns
    prd_te_all = pd.merge(prd_te_all, cancer_types, on='CELL')
    prd_te_all = reorg_cols(prd_te_all, col_first='CTYPE')

    # Rename
    prd_te_all = prd_te_all.rename(columns={
        'y_true': 'y_true_cls',
        'y_pred': 'y_pred_prob'
    })

    # Retain specific columns
    cols = [
        'idx', 'run', 'SOURCE', 'CTYPE', 'CELL', 'DRUG', 'R2fit', 'AUC',
        'y_true_cls', 'y_pred_prob'
    ]
    prd_te_all = prd_te_all[cols]

    # Add col of pred labels
    prd_te_all['y_pred_cls'] = prd_te_all.y_pred_prob.map(lambda x: 0
                                                          if x < 0.5 else 1)

    # The per-sample error ranges from 0 (best) to 1 (worst);
    # the Brier score is the mean of the squared values of this error.
    prd_te_all['prob_err'] = abs(prd_te_all.y_true_cls -
                                 prd_te_all.y_pred_prob)

    # Bin AUC values
    bins = np.arange(0, 1.1, 0.1).tolist()
    prd_te_all['AUC_bin'] = pd.cut(prd_te_all.AUC,
                                   bins,
                                   right=True,
                                   labels=None,
                                   retbins=False,
                                   precision=3,
                                   include_lowest=False,
                                   duplicates='raise')
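    # e.g., with these bins an AUC value of 0.42 falls into the (0.4, 0.5] interval.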

    # Add col that categorizes the preds
    prd_te_all['prd_cat'] = None
    prd_te_all.loc[(prd_te_all.y_true_cls == 1) &
                   (prd_te_all.y_pred_cls == 1), 'prd_cat'] = 'TP'
    prd_te_all.loc[(prd_te_all.y_true_cls == 0) &
                   (prd_te_all.y_pred_cls == 0), 'prd_cat'] = 'TN'
    prd_te_all.loc[(prd_te_all.y_true_cls == 1) &
                   (prd_te_all.y_pred_cls == 0), 'prd_cat'] = 'FN'
    prd_te_all.loc[(prd_te_all.y_true_cls == 0) &
                   (prd_te_all.y_pred_cls == 1), 'prd_cat'] = 'FP'

    # Add cols
    prd_te_all['TP'] = prd_te_all.prd_cat == 'TP'
    prd_te_all['TN'] = prd_te_all.prd_cat == 'TN'
    prd_te_all['FP'] = prd_te_all.prd_cat == 'FP'
    prd_te_all['FN'] = prd_te_all.prd_cat == 'FN'

    # Save aggregated master table
    prd_te_all.to_csv(gout / 'prd_te_all.csv', index=False)

    # Plot confusion matrix
    from sklearn.metrics import (confusion_matrix, matthews_corrcoef,
                                 ConfusionMatrixDisplay)
    y_true_cls = prd_te_all.y_true_cls
    y_pred_cls = prd_te_all.y_pred_cls
    np_conf = confusion_matrix(y_true_cls, y_pred_cls)
    tn, fp, fn, tp = np_conf.ravel()

    mcc = matthews_corrcoef(y_true_cls, y_pred_cls, sample_weight=None)
    print('TN:', tn)
    print('FP:', fp)
    print('FN:', fn)
    print('TP:', tp)
    print('FPR:', fp / (fp + tn))
    print('FNR:', fn / (fn + tp))
    print('MCC:', mcc)

    with open(gout / 'scores.txt', 'w') as f:
        f.write('TN: {:d}\n'.format(tn))
        f.write('FP: {:d}\n'.format(fp))
        f.write('FN: {:d}\n'.format(fn))
        f.write('TP: {:d}\n'.format(tp))
        f.write('FPR: {:.5f}\n'.format(fp / (fp + tn)))
        f.write('FNR: {:.5f}\n'.format(fn / (fn + tp)))
        f.write('MCC: {:.5f}\n'.format(mcc))

    # Confusion Matrix
    conf = confusion_matrix(y_true_cls, y_pred_cls, normalize=None)
    conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp'])
    conf_plot.plot(include_values=True,
                   cmap=plt.cm.Blues,
                   ax=None,
                   xticks_rotation=None,
                   values_format='d')
    plt.savefig(gout / 'conf_mat.png', dpi=100)

    # Confusion Matrix (normalized)
    conf = confusion_matrix(y_true_cls, y_pred_cls, normalize='all')
    conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp'])
    conf_plot.plot(include_values=True,
                   cmap=plt.cm.Blues,
                   ax=None,
                   xticks_rotation=None,
                   values_format='.2f')
    conf_plot.ax_.set_title('Normalized')
    plt.savefig(gout / 'conf_mat_norm.png', dpi=100)

    def add_conf_data(data):
        """ Add the columns used to compute confusion-matrix counts and rates
        (TP, TN, FN, FP, TPR, TNR, FPR, FNR) per group. """
        data['TP'] = data.apply(lambda row: row.y_pred_cls_1
                                if row.y_true_cls == 1 else False,
                                axis=1)  # tp
        data['TN'] = data.apply(lambda row: row.y_pred_cls_0
                                if row.y_true_cls == 0 else False,
                                axis=1)  # tn
        data['FN'] = data.apply(lambda row: row.y_pred_cls_0
                                if row.y_true_cls == 1 else False,
                                axis=1)  # fn
        data['FP'] = data.apply(lambda row: row.y_pred_cls_1
                                if row.y_true_cls == 0 else False,
                                axis=1)  # fp

        data['TPR'] = data.apply(
            lambda row: np.nan
            if (row.TP == 0) & (row.FN == 0) else row.TP / (row.TP + row.FN),
            axis=1)  # sensitivity, recall: TP/P = TP/(TP+FN)
        data['TNR'] = data.apply(lambda row: np.nan
                                 if (row.TN == 0) & (row.FP == 0) else row.TN /
                                 (row.TN + row.FP),
                                 axis=1)  # specificity: TN/N = TN/(TN+FP)

        data['FPR'] = data.apply(lambda row: np.nan
                                 if (row.TN == 0) & (row.FP == 0) else row.FP /
                                 (row.TN + row.FP),
                                 axis=1)  # fall-out: FP/N = FP/(FP+TN)
        data['FNR'] = data.apply(lambda row: np.nan
                                 if (row.TP == 0) & (row.FN == 0) else row.FN /
                                 (row.TP + row.FN),
                                 axis=1)  # miss rate: FN/P = FN/(FN+TP)
        return data
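    # Worked example (hypothetical group): with y_pred_cls_1 = 8 and y_pred_cls_0 = 2
    # for a y_true_cls == 1 group, TP = 8 and FN = 2, so TPR = 8 / (8 + 2) = 0.8.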

    # Summary table
    prd_te_to_grp = prd_te_all.copy()
    prd_te_to_grp['y_pred_prob_median'] = prd_te_to_grp.y_pred_prob
    prd_te_to_grp['y_pred_prob_std'] = prd_te_to_grp.y_pred_prob
    prd_te_to_grp['y_pred_tot'] = prd_te_to_grp.idx
    prd_te_to_grp['y_pred_cls_0'] = prd_te_to_grp.y_pred_prob.map(
        lambda x: True if x < 0.5 else False)
    prd_te_to_grp['y_pred_cls_1'] = prd_te_to_grp.y_pred_prob.map(
        lambda x: True if x >= 0.5 else False)
    prd_te_to_grp['y_true_unq_vals'] = prd_te_to_grp.y_true_cls

    # -----------------------
    # Groupby Cell
    # -----------------------
    by = 'CELL'
    sm_cell = prd_te_to_grp.groupby([by, 'y_true_cls']).agg({
        'DRUG':
        'unique',
        'CTYPE':
        'unique',
        'y_true_unq_vals':
        'unique',
        'y_pred_prob_median':
        np.median,
        'y_pred_prob_std':
        np.std,
        'y_pred_cls_0':
        lambda x: int(sum(x)),
        'y_pred_cls_1':
        lambda x: int(sum(x)),
        'y_pred_tot':
        lambda x: len(np.unique(x)),
    }).reset_index().sort_values(by, ascending=True)

    sm_cell['y_true_unq_vals'] = sm_cell.y_true_unq_vals.map(
        lambda x: len(x) if type(x) == np.ndarray else 1)
    sm_cell = add_conf_data(sm_cell)
    sm_cell.to_csv(gout / 'sm_by_cell.csv', index=False)

    # -----------------------
    # Groupby Cancer Type
    # -----------------------
    by = 'CTYPE'
    sm_ctype = prd_te_to_grp.groupby([by, 'y_true_cls']).agg({
        'DRUG':
        'unique',
        'CELL':
        'unique',
        'y_true_unq_vals':
        'unique',
        'y_pred_prob_median':
        np.median,
        'y_pred_prob_std':
        np.std,
        'y_pred_cls_0':
        lambda x: int(sum(x)),
        'y_pred_cls_1':
        lambda x: int(sum(x)),
        'y_pred_tot':
        lambda x: len(np.unique(x)),
    }).reset_index().sort_values(by, ascending=True)

    sm_ctype['y_true_unq_vals'] = sm_ctype.y_true_unq_vals.map(
        lambda x: len(x) if type(x) == np.ndarray else 1)
    sm_ctype = add_conf_data(sm_ctype)
    sm_ctype.to_csv(gout / 'sm_by_ctype.csv', index=False)

    # -----------------------
    # Groupby Drug
    # -----------------------
    by = 'DRUG'
    sm_drug = prd_te_to_grp.groupby([by, 'y_true_cls']).agg({
        'CTYPE':
        'unique',
        'CELL':
        'unique',
        'y_true_unq_vals':
        'unique',
        'y_pred_prob_median':
        np.median,
        'y_pred_prob_std':
        np.std,
        'y_pred_cls_0':
        lambda x: int(sum(x)),
        'y_pred_cls_1':
        lambda x: int(sum(x)),
        'y_pred_tot':
        lambda x: len(np.unique(x)),
    }).reset_index().sort_values(by, ascending=True)

    sm_drug['y_true_unq_vals'] = sm_drug.y_true_unq_vals.map(
        lambda x: len(x) if type(x) == np.ndarray else 1)
    sm_drug = add_conf_data(sm_drug)
    sm_drug.to_csv(gout / 'sm_by_drug.csv', index=False)

    # --------------------------------------------------------
    lg.kill_logger()
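
# The helpers below are used in the example above but are not shown there.
# These are minimal sketches of assumed implementations (the originals may differ);
# in particular, the class convention in bin_rsp (response below the threshold
# mapped to 1) is an assumption.
import pandas as pd


def bin_rsp(y, resp_thres=0.5):
    """Binarize a continuous drug-response column at a threshold."""
    y = pd.DataFrame(y)
    return (y < resp_thres).astype(int)


def get_data_by_id(ids, xdata, ydata, meta):
    """Select rows by positional index from the feature, target, and meta frames."""
    x = xdata.iloc[ids, :].reset_index(drop=True)
    y = ydata.iloc[ids, :].reset_index(drop=True)
    m = meta.iloc[ids, :].reset_index(drop=True)
    return x, y, m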
Example #2
    def trn_learning_curve(
            self,
            framework: str = 'lightgbm',
            mltype: str = 'reg',
            model_name: str = 'lgb_reg',  # TODO: this is redundant
            init_kwargs: dict = {},
            fit_kwargs: dict = {},
            clr_keras_kwargs: dict = {},
            metrics: list = [
                'r2', 'neg_mean_absolute_error', 'neg_median_absolute_error',
                'neg_mean_squared_error'
            ],
            n_jobs: int = 4,
            random_state: int = None,
            plot=True):
        """ 
        Args:
            framework : ML framework (keras, lightgbm, or sklearn)
            mltype : type of ML problem (reg or cls)
            init_kwargs : dict of parameters that initialize the estimator
            fit_kwargs : dict of parameters passed to the estimator's fit() method
            clr_keras_kwargs : dict of cyclical learning-rate (CLR) settings for keras models
            metrics : list of metric names to compute (TODO: also allow a single string)
        """
        self.framework = framework
        self.mltype = mltype
        self.model_name = model_name
        self.init_kwargs = init_kwargs
        self.fit_kwargs = fit_kwargs
        self.clr_keras_kwargs = clr_keras_kwargs
        self.metrics = metrics
        self.n_jobs = n_jobs
        self.random_state = random_state

        # Start nested loop of train size and cv folds
        tr_scores_all = []  # list of dicts
        vl_scores_all = []  # list of dicts
        te_scores_all = []  # list of dicts

        # Record runtime per shard
        runtime_records = []

        # CV loop
        for fold, (tr_k, vl_k, te_k) in enumerate(
                zip(self.tr_dct.keys(), self.vl_dct.keys(),
                    self.te_dct.keys())):
            fold = fold + 1
            if self.logger is not None:
                self.logger.info(f'Fold {fold}/{self.cv_folds}')

            # Get the indices for this fold
            tr_id = self.tr_dct[tr_k]
            vl_id = self.vl_dct[vl_k]
            te_id = self.te_dct[te_k]

            # Samples from this dataset are randomly sampled for training
            xtr = self.X[tr_id, :]
            # ytr = self.Y[tr_id, :]
            ytr = np.squeeze(self.Y[tr_id, :])

            # A fixed set of val samples for the current CV split
            xvl = self.X[vl_id, :]
            yvl = np.squeeze(self.Y[vl_id, :])

            # A fixed set of test samples for the current CV split
            xte = self.X[te_id, :]
            yte = np.squeeze(self.Y[te_id, :])

            # Shards loop (iterate across the dataset sizes and train)
            """
            np.random.seed(random_state)
            idx = np.random.permutation(len(xtr))
            Note that we don't shuffle the dataset another time using the commands above.
            """
            idx = np.arange(len(xtr))
            for i, tr_sz in enumerate(self.tr_shards):
                # For each shard: train model, save best model, calc tr_scores, calc_vl_scores
                if self.logger:
                    self.logger.info(
                        f'\tTrain size: {tr_sz} ({i+1}/{len(self.tr_shards)})')

                # Sequentially get a subset of samples (the input dataset X must be shuffled)
                xtr_sub = xtr[idx[:tr_sz], :]
                # ytr_sub = np.squeeze(ytr[idx[:tr_sz], :])
                ytr_sub = ytr[idx[:tr_sz]]

                # Get the estimator
                estimator = ml_models.get_model(self.model_name,
                                                init_kwargs=self.init_kwargs)
                model = estimator.model

                # Train
                # self.val_split = 0 # 0.1 # used for early stopping
                #self.eval_frac = 0.1 # 0.1 # used for early stopping
                #eval_samples = int(self.eval_frac * xvl.shape[0])
                #eval_set = (xvl[:eval_samples, :], yvl[:eval_samples]) # we don't random sample; the same eval_set is used for early stopping
                eval_set = (xvl, yvl)
                if self.framework == 'lightgbm':
                    model, trn_outdir, runtime = self.trn_lgbm_model(
                        model=model,
                        xtr_sub=xtr_sub,
                        ytr_sub=ytr_sub,
                        fold=fold,
                        tr_sz=tr_sz,
                        eval_set=eval_set)
                elif self.framework == 'sklearn':
                    model, trn_outdir, runtime = self.trn_sklearn_model(
                        model=model,
                        xtr_sub=xtr_sub,
                        ytr_sub=ytr_sub,
                        fold=fold,
                        tr_sz=tr_sz,
                        eval_set=None)
                elif self.framework == 'keras':
                    model, trn_outdir, runtime = self.trn_keras_model(
                        model=model,
                        xtr_sub=xtr_sub,
                        ytr_sub=ytr_sub,
                        fold=fold,
                        tr_sz=tr_sz,
                        eval_set=eval_set)
                elif self.framework == 'pytorch':
                    pass
                else:
                    raise ValueError(
                        f'Framework {self.framework} is not supported.')

                # Save plot of target distribution
                plot_hist(ytr_sub,
                          var_name=f'Target (Train size={tr_sz})',
                          fit=None,
                          bins=100,
                          path=trn_outdir / 'target_hist_tr.png')
                plot_hist(yvl,
                          var_name=f'Target (Val size={len(yvl)})',
                          fit=None,
                          bins=100,
                          path=trn_outdir / 'target_hist_vl.png')
                plot_hist(yte,
                          var_name=f'Target (Test size={len(yte)})',
                          fit=None,
                          bins=100,
                          path=trn_outdir / 'target_hist_te.png')

                # Calc preds and scores TODO: dump preds
                # ... training set
                y_pred, y_true = calc_preds(model,
                                            x=xtr_sub,
                                            y=ytr_sub,
                                            mltype=self.mltype)
                tr_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                tr_scores['y_avg'] = np.mean(y_pred)
                # ... val set
                y_pred, y_true = calc_preds(model,
                                            x=xvl,
                                            y=yvl,
                                            mltype=self.mltype)
                vl_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                vl_scores['y_avg'] = np.mean(y_pred)
                # ... test set
                y_pred, y_true = calc_preds(model,
                                            x=xte,
                                            y=yte,
                                            mltype=self.mltype)
                te_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                te_scores['y_avg'] = np.mean(y_pred)

                del estimator, model

                # TODO: save predictions (need to include metadata)

                # Store runtime
                runtime_records.append((fold, tr_sz, runtime))

                # Add metadata
                # tr_scores['tr_set'] = True
                tr_scores['set'] = 'tr'
                tr_scores['fold'] = 'fold' + str(fold)
                tr_scores['tr_size'] = tr_sz

                # vl_scores['tr_set'] = False
                vl_scores['set'] = 'vl'
                vl_scores['fold'] = 'fold' + str(fold)
                vl_scores['tr_size'] = tr_sz

                # te_scores['tr_set'] = False
                te_scores['set'] = 'te'
                te_scores['fold'] = 'fold' + str(fold)
                te_scores['tr_size'] = tr_sz

                # Append scores (dicts)
                tr_scores_all.append(tr_scores)
                vl_scores_all.append(vl_scores)
                te_scores_all.append(te_scores)

                # Dump intermediate scores
                # TODO: test this!
                scores_tmp = pd.concat([
                    scores_to_df([tr_scores]),
                    scores_to_df([vl_scores]),
                    scores_to_df([te_scores])
                ],
                                       axis=0)
                scores_tmp.to_csv(trn_outdir / ('scores_tmp.csv'), index=False)
                del trn_outdir, scores_tmp

            # Dump intermediate results (useful if the job terminates before all folds complete)
            scores_all_df_tmp = pd.concat([
                scores_to_df(tr_scores_all),
                scores_to_df(vl_scores_all),
                scores_to_df(te_scores_all)
            ],
                                          axis=0)
            scores_all_df_tmp.to_csv(
                self.outdir / ('_lrn_crv_scores_cv' + str(fold) + '.csv'),
                index=False)

        # Scores to df
        tr_scores_df = scores_to_df(tr_scores_all)
        vl_scores_df = scores_to_df(vl_scores_all)
        te_scores_df = scores_to_df(te_scores_all)
        scores_df = pd.concat([tr_scores_df, vl_scores_df, te_scores_df],
                              axis=0)

        # Dump final results
        tr_scores_df.to_csv(self.outdir / 'tr_lrn_crv_scores.csv', index=False)
        vl_scores_df.to_csv(self.outdir / 'vl_lrn_crv_scores.csv', index=False)
        te_scores_df.to_csv(self.outdir / 'te_lrn_crv_scores.csv', index=False)
        scores_df.to_csv(self.outdir / 'lrn_crv_scores.csv', index=False)

        # Runtime df
        runtime_df = pd.DataFrame.from_records(
            runtime_records, columns=['fold', 'tr_sz', 'time'])
        runtime_df.to_csv(self.outdir / 'runtime.csv', index=False)

        # Plot learning curves
        if plot:
            plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir)
            plot_lrn_crv_all_metrics(scores_df,
                                     outdir=self.outdir,
                                     xtick_scale='log2',
                                     ytick_scale='log2')
            plot_runtime(runtime_df,
                         outdir=self.outdir,
                         xtick_scale='log2',
                         ytick_scale='log2')

        return scores_df
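
# A minimal sketch of the scores_to_df() helper used above (assumed implementation;
# the original may reshape the scores into a long format with 'metric'/'score' columns):
import pandas as pd


def scores_to_df(scores_list):
    """Stack a list of score dicts (one dict per run/shard) into a DataFrame."""
    return pd.DataFrame(scores_list)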
Example #3
def run(args):
    dirpath = Path(args['dirpath'])
    target_name = args['target_name']
    cv_folds = args['cv_folds']

    # Features
    # cell_fea = args['cell_fea']
    # drug_fea = args['drug_fea']
    # fea_list = cell_fea + drug_fea

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']

    # Optimizer
    opt_name = args['opt']
    clr_keras_kwargs = {
        'mode': args['clr_mode'],
        'base_lr': args['clr_base_lr'],
        'max_lr': args['clr_max_lr'],
        'gamma': args['clr_gamma']
    }

    # Other params
    model_name = args['model_name']
    skp_ep = args['skp_ep']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls')
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    src = dirpath.name.split('_')[0]

    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath).split('_')[0] + '_trn')
    # os.makedirs(outdir, exist_ok=True)
    run_outdir = create_outdir(outdir, args, src)
    lg = Logger(run_outdir / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=run_outdir / 'args.txt')

    # -----------------------------------------------
    #       Load data and pre-proc
    # -----------------------------------------------
    def get_file(fpath):
        return pd.read_csv(
            fpath, header=None).squeeze().values if fpath.is_file() else None

    def read_data_file(fpath, file_format='csv'):
        fpath = Path(fpath)
        if fpath.is_file():
            if file_format == 'csv':
                df = pd.read_csv(fpath)
            elif file_format == 'parquet':
                df = pd.read_parquet(fpath)
        else:
            df = None
        return df
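    # e.g., read_data_file(dirpath / 'xdata.parquet', 'parquet') returns a DataFrame,
    # or None if the file does not exist.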

    # Data splits
    tr_id = pd.read_csv(dirpath / f'{cv_folds}fold_tr_id.csv')
    vl_id = pd.read_csv(dirpath / f'{cv_folds}fold_vl_id.csv')
    te_id = pd.read_csv(dirpath / 'te_id.csv')

    tr_dct = {}
    vl_dct = {}

    for fold in range(tr_id.shape[1]):
        tr_dct[fold] = tr_id.iloc[:, fold].dropna().values.astype(int).tolist()
        vl_dct[fold] = vl_id.iloc[:, fold].dropna().values.astype(int).tolist()

    te_id = te_id.iloc[:, 0].dropna().values.astype(int).tolist()

    # Load data
    lg.logger.info(f'\nLoading data ...')
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[target_name]]

    # Scale
    lg.logger.info(f'\nScaling data ...')
    scaler = args['scaler']
    if scaler is not None:
        if scaler == 'stnd':
            scaler = StandardScaler()
        elif scaler == 'minmax':
            scaler = MinMaxScaler()
        elif scaler == 'rbst':
            scaler = RobustScaler()

        cols = xdata.columns
        xdata = pd.DataFrame(scaler.fit_transform(xdata),
                             columns=cols,
                             dtype=np.float32)

    # Test set
    xte = xdata.iloc[te_id, :]
    yte = np.squeeze(ydata.iloc[te_id, :]).values

    # -----------------------------------------------
    #      ML model configs
    # -----------------------------------------------
    if model_name == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {
            'n_jobs': n_jobs,
            'random_state': SEED,
            'logger': lg.logger
        }
        fit_kwargs = {'verbose': False}
    elif model_name == 'nn_reg':
        framework = 'keras'
        init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'attn': args['attn'],  # assumed to come from args like the other NN params
            'logger': lg.logger
        }
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}
    elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg2'):
        framework = 'keras'
        init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'logger': lg.logger
        }
        fit_kwargs = {
            'batch_size': batch_size,
            'epochs': epochs,
            'verbose': 1
        }  # 'validation_split': 0.1
    elif model_name in ('nn_reg3', 'nn_reg4'):
        framework = 'keras'
        init_kwargs = {
            'in_dim_rna': None,
            'in_dim_dsc': None,
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'logger': lg.logger
        }
        fit_kwargs = {
            'batch_size': batch_size,
            'epochs': epochs,
            'verbose': 1
        }  # 'validation_split': 0.1

    # -----------------------------------------------
    #      Train
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('=' * 50))
    lg.logger.info(f'Train {src} ...')
    lg.logger.info('=' * 50)

    # CV loop
    for fold, (tr_k, vl_k) in enumerate(zip(tr_dct.keys(), vl_dct.keys())):
        if lg.logger is not None: lg.logger.info(f'Fold {fold+1}/{cv_folds}')

        tr_id = tr_dct[tr_k]
        vl_id = vl_dct[vl_k]

        # Samples from this dataset are randomly sampled for training
        xtr = xdata.iloc[tr_id, :]
        ytr = np.squeeze(ydata.iloc[tr_id, :]).values

        # A fixed set of validation samples for the current CV split
        xvl = xdata.iloc[vl_id, :]
        yvl = np.squeeze(ydata.iloc[vl_id, :]).values

        # Get the estimator
        estimator = ml_models.get_model(model_name, init_kwargs=init_kwargs)
        model = estimator.model

        keras.utils.plot_model(model, to_file=run_outdir / 'nn_model.png')

        # Callbacks
        # keras_callbacks = define_keras_callbacks(run_outdir)
        model_checkpoint_dir = run_outdir / 'models'
        os.makedirs(model_checkpoint_dir, exist_ok=True)
        checkpointer = ModelCheckpoint(str(
            model_checkpoint_dir /
            'model.ep_{epoch:d}-val_loss_{val_loss:.4f}-val_mae_{val_mean_absolute_error:.4f}.h5'
        ),
                                       save_best_only=False)
        csv_logger = CSVLogger(run_outdir / 'training.log')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.75,
                                      patience=20,
                                      verbose=1,
                                      mode='auto',
                                      min_delta=0.0001,
                                      cooldown=3,
                                      min_lr=0.000000001)
        early_stop = EarlyStopping(monitor='val_loss', patience=60, verbose=1)
        keras_callbacks = [checkpointer, csv_logger, early_stop, reduce_lr]

        if clr_keras_kwargs['mode'] is not None:
            keras_callbacks.append(
                ml_models.clr_keras_callback(**clr_keras_kwargs))

        # Fit params
        fit_kwargs['validation_data'] = (xvl, yvl)
        fit_kwargs['callbacks'] = keras_callbacks

        # Train
        t0 = time()
        history = model.fit(xtr, ytr, **fit_kwargs)
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

        # Dump model, history, plots
        model.save(str(run_outdir / 'model_final.h5'))
        hh = ml_models.save_krs_history(history, outdir=run_outdir)
        ml_models.plot_prfrm_metrics(history,
                                     title=f'Training',
                                     skp_ep=skp_ep,
                                     add_lr=True,
                                     outdir=run_outdir)

        # Multi-gpu training
        # keras.utils.multi_gpu_model(model, gpus=[0, 1], cpu_merge=True, cpu_relocation=False)

        # Load the best model to make preds
        eval_metric = 'val_mean_absolute_error'
        ep_best = hh.loc[hh[eval_metric] == hh[eval_metric].min(),
                         'epoch'].values[0]
        mpath = glob(
            str(model_checkpoint_dir / f'model.ep_{ep_best}-val_loss*.h5'))[0]
        model = load_model(mpath)

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_dict(tr_scores, outpath=run_outdir / 'tr_scores.txt')
        pd.DataFrame({
            'y_true': y_true.reshape(-1),
            'y_pred': y_pred.reshape(-1, )
        }).to_csv(run_outdir / 'tr_preds.csv', index=False)
        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_dict(vl_scores, outpath=run_outdir / 'vl_scores.txt')
        pd.DataFrame({
            'y_true': y_true.reshape(-1, ),
            'y_pred': y_pred.reshape(-1, )
        }).to_csv(run_outdir / 'vl_preds.csv', index=False)

    # Calc preds and scores for test set
    y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
    te_scores = calc_scores(y_true=y_true,
                            y_pred=y_pred,
                            mltype=mltype,
                            metrics=None)
    dump_dict(te_scores, outpath=run_outdir / 'te_scores.txt')
    pd.DataFrame({
        'y_true': y_true.reshape(-1),
        'y_pred': y_pred.reshape(-1, )
    }).to_csv(run_outdir / 'te_preds.csv', index=False)

    lg.kill_logger()
    del xdata, ydata

    print('Done.')
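
# A minimal sketch of the dump_dict() helper used above (assumed implementation):
def dump_dict(dct, outpath):
    """Write a dict to a text file, one 'key: value' pair per line."""
    with open(outpath, 'w') as f:
        for k, v in dct.items():
            f.write(f'{k}: {v}\n')
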
    def trn_learning_curve(
            self,
            framework: str = 'lightgbm',
            mltype: str = 'reg',
            model_name: str = 'lgb_reg',  # TODO: this is redundant
            init_kwargs: dict = {},
            fit_kwargs: dict = {},
            clr_keras_kwargs: dict = {},
            metrics: list = [
                'r2', 'neg_mean_absolute_error', 'neg_median_absolute_error',
                'neg_mean_squared_error'
            ],
            n_jobs: int = 4,
            random_state: int = None,
            plot=True):
        """ 
        Args:
            framework : ML framework (keras, lightgbm, or sklearn)
            mltype : type of ML problem (reg or cls)
            init_kwargs : dict of parameters that initialize the estimator
            fit_kwargs : dict of parameters passed to the estimator's fit() method
            clr_keras_kwargs : dict of cyclical learning-rate (CLR) settings for keras models
            metrics : list of metric names to compute (TODO: also allow a single string)
        """
        self.framework = framework
        self.mltype = mltype
        self.model_name = model_name
        self.init_kwargs = init_kwargs
        self.fit_kwargs = fit_kwargs
        self.clr_keras_kwargs = clr_keras_kwargs
        self.metrics = metrics
        self.n_jobs = n_jobs
        self.random_state = random_state

        # Start nested loop of train size and cv folds
        tr_scores_all = []  # list of dicts
        vl_scores_all = []  # list of dicts

        # CV loop
        for fold, (tr_k, vl_k) in enumerate(
                zip(self.tr_dct.keys(), self.vl_dct.keys())):
            if self.logger is not None:
                self.logger.info(f'Fold {fold+1}/{self.cv_folds}')

            tr_id = self.tr_dct[tr_k]
            vl_id = self.vl_dct[vl_k]

            # Samples from this dataset are randomly sampled for training
            xtr = self.X[tr_id, :]
            ytr = self.Y[tr_id, :]

            # A fixed set of validation samples for the current CV split
            xvl = self.X[vl_id, :]
            yvl = np.squeeze(self.Y[vl_id, :])

            # Shards loop (iterate across the dataset sizes and train)
            # np.random.seed(random_state)
            # idx = np.random.permutation(len(xtr))
            idx = np.arange(len(xtr))
            for i, tr_sz in enumerate(self.tr_shards):
                # For each shard: train model, save best model, calc tr_scores, calc_vl_scores
                if self.logger:
                    self.logger.info(
                        f'\tTrain size: {tr_sz} ({i+1}/{len(self.tr_shards)})')

                # Sequentially get a subset of samples (the input dataset X must be shuffled)
                xtr_sub = xtr[idx[:tr_sz], :]
                ytr_sub = np.squeeze(ytr[idx[:tr_sz], :])

                # Get the estimator
                estimator = ml_models.get_model(self.model_name,
                                                init_kwargs=self.init_kwargs)
                model = estimator.model

                # Train
                # self.val_split = 0 # 0.1 # used for early stopping
                self.eval_frac = 0.1  # 0.1 # used for early stopping
                eval_samples = int(self.eval_frac * xvl.shape[0])
                eval_set = (xvl[:eval_samples, :], yvl[:eval_samples])
                if self.framework == 'lightgbm':
                    model, trn_outdir = self.trn_lgbm_model(model=model,
                                                            xtr_sub=xtr_sub,
                                                            ytr_sub=ytr_sub,
                                                            fold=fold,
                                                            tr_sz=tr_sz,
                                                            eval_set=eval_set)
                elif self.framework == 'keras':
                    model, trn_outdir = self.trn_keras_model(model=model,
                                                             xtr_sub=xtr_sub,
                                                             ytr_sub=ytr_sub,
                                                             fold=fold,
                                                             tr_sz=tr_sz,
                                                             eval_set=eval_set)
                elif self.framework == 'pytorch':
                    pass
                else:
                    raise ValueError(
                        f'framework {self.framework} is not supported.')

                # Calc preds and scores TODO: dump preds
                # ... training set
                y_pred, y_true = calc_preds(model,
                                            x=xtr_sub,
                                            y=ytr_sub,
                                            mltype=self.mltype)
                tr_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                # ... val set
                y_pred, y_true = calc_preds(model,
                                            x=xvl,
                                            y=yvl,
                                            mltype=self.mltype)
                vl_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)

                del estimator, model
                # nm = ((y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
                # dn = ((y_true - np.average(y_true, axis=0)) ** 2).sum(axis=0, dtype=np.float64)

                # Add metadata
                tr_scores['tr_set'] = True
                tr_scores['fold'] = 'fold' + str(fold)
                tr_scores['tr_size'] = tr_sz

                vl_scores['tr_set'] = False
                vl_scores['fold'] = 'fold' + str(fold)
                vl_scores['tr_size'] = tr_sz

                # Append scores (dicts)
                tr_scores_all.append(tr_scores)
                vl_scores_all.append(vl_scores)

                # Dump intermediate scores (TODO: test this)
                scores_tmp = pd.concat(
                    [scores_to_df(tr_scores_all),
                     scores_to_df(vl_scores_all)],
                    axis=0)
                scores_tmp.to_csv(trn_outdir / 'tmp_scores.csv', index=False)
                del trn_outdir, scores_tmp

            # Dump intermediate results (useful if the job terminates before all folds complete)
            # tr_df_tmp = scores_to_df(tr_scores_all)
            # vl_df_tmp = scores_to_df(vl_scores_all)
            scores_all_df_tmp = pd.concat(
                [scores_to_df(tr_scores_all),
                 scores_to_df(vl_scores_all)],
                axis=0)
            scores_all_df_tmp.to_csv(
                self.outdir / ('_lrn_crv_scores_cv' + str(fold + 1) + '.csv'),
                index=False)

        # Scores to df
        tr_scores_df = scores_to_df(tr_scores_all)
        vl_scores_df = scores_to_df(vl_scores_all)
        scores_df = pd.concat([tr_scores_df, vl_scores_df], axis=0)

        # Dump final results
        tr_scores_df.to_csv(self.outdir / 'tr_lrn_crv_scores.csv', index=False)
        vl_scores_df.to_csv(self.outdir / 'vl_lrn_crv_scores.csv', index=False)
        scores_df.to_csv(self.outdir / 'lrn_crv_scores.csv', index=False)

        # Plot learning curves
        if plot:
            plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir)

        return scores_df
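
# Minimal sketches of the calc_preds() and calc_scores() helpers used throughout these
# examples (assumed implementations; the originals likely compute a richer set of metrics):
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, roc_auc_score


def calc_preds(model, x, y, mltype):
    """Return (y_pred, y_true) for a fitted model."""
    if mltype == 'cls' and hasattr(model, 'predict_proba'):
        y_pred = model.predict_proba(x)[:, 1]  # probability of the positive class
    else:
        y_pred = model.predict(x)
    return np.asarray(y_pred).ravel(), np.asarray(y).ravel()


def calc_scores(y_true, y_pred, mltype, metrics=None):
    """Return a dict of scores; regression and classification are handled separately."""
    if mltype == 'reg':
        return {'r2': r2_score(y_true, y_pred),
                'mean_absolute_error': mean_absolute_error(y_true, y_pred)}
    return {'auc_roc': roc_auc_score(y_true, y_pred)}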