Example #1
def data_prep_nn1_def(xdata):
    """
    This func prepares the dataset for keras model.
    """
    x_ge = extract_subset_fea(xdata, fea_list=['ge'], fea_sep='_')
    x_dd = extract_subset_fea(xdata, fea_list=['dd'], fea_sep='_')
    x_ge = np.asarray(x_ge)
    x_dd = np.asarray(x_dd)
    x_dct = {'in_ge': x_ge, 'in_dd': x_dd}
    return x_dct
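
All examples on this page rely on extract_subset_fea, whose definition is not shown. A minimal sketch of the behavior implied by these call sites (an assumption, not the actual implementation) selects the columns whose names start with one of the feature prefixes followed by the separator:

import pandas as pd

def extract_subset_fea(df, fea_list, fea_sep='_'):
    # Illustrative sketch only: keep columns prefixed with e.g. 'ge_' or 'dd_'.
    if isinstance(fea_list, str):
        fea_list = [fea_list]
    prefixes = tuple(f'{p}{fea_sep}' for p in fea_list)
    fea_cols = [c for c in df.columns if c.startswith(prefixes)]
    return df[fea_cols]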
Example #2
def trn_baseline(ml_df, fea_list=['dd'], fea_sep='_'):
    """ Train baseline model using LGBM. """
    try:
        import lightgbm as lgb
    except ImportError:
        print('Could not import lightgbm.')
        return None

    from sklearn.model_selection import train_test_split
    from datasplit.splitter import data_splitter
    from ml.evals import calc_preds, calc_scores, dump_preds
    ml_model_def = lgb.LGBMRegressor
    ml_init_args = {'n_jobs': 8}
    ml_fit_args = {'verbose': False, 'early_stopping_rounds': 10}
    model = ml_model_def(**ml_init_args)
    ydata = ml_df['reg']
    xdata = extract_subset_fea(ml_df, fea_list=fea_list, fea_sep=fea_sep)
    x_, xte, y_, yte = train_test_split(xdata, ydata, test_size=0.2)
    xtr, xvl, ytr, yvl = train_test_split(x_, y_, test_size=0.2)
    ml_fit_args['eval_set'] = (xvl, yvl)
    model.fit(xtr, ytr, **ml_fit_args)
    y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype='reg')
    te_scr = calc_scores(y_true=y_true,
                         y_pred=y_pred,
                         mltype='reg',
                         metrics=None)
    return te_scr
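
A hypothetical call, assuming an older lightgbm whose fit() still accepts the verbose/early_stopping_rounds keywords, and a dataframe with a 'reg' target plus 'dd_'-prefixed feature columns (the toy data below is made up for illustration):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
ml_df = pd.DataFrame({
    'reg': rng.random(500),        # target column expected by trn_baseline
    'dd_fea1': rng.random(500),    # 'dd'-prefixed feature columns
    'dd_fea2': rng.random(500),
})
scores = trn_baseline(ml_df, fea_list=['dd'], fea_sep='_')
print(scores)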
Example #3
def load_mordred_descriptors(drg_set,
                             fea_name,
                             col_name,
                             drug_names=None,
                             fea_sep='_',
                             n_jobs=64,
                             N=None):
    """ Load Mordred descriptors files. The files contains subsets of
    descriptors generated on an HPC.
    """
    files_path = Path(FEA_DIR, drg_set, fea_name).resolve()
    fea_files = sorted(files_path.glob(f'{drg_set}-*.csv'))

    if len(fea_files) > 0:
        fea_prfx = 'dd'
        fea_names = pd.read_csv(FEA_DIR / 'dd_fea_names.csv').columns.tolist()
        fea_names = [c.strip() for c in fea_names]  # clean names
        fea_names = [fea_prfx + fea_sep + str(c)
                     for c in fea_names]  # prefix fea names
        cols = ['CAT', 'TITLE', 'SMILES'] + fea_names
        # cols = ['CAT', 'TITLE'] + fea_names

        t = time()
        dfs = Parallel(n_jobs=n_jobs,
                       verbose=10)(delayed(load_and_get_samples)(
                           f, cols, col_name=col_name, drug_names=drug_names)
                                   for f in fea_files[:N])
        # dfs = []
        # for ii, f in enumerate(fea_files):
        #     if ii%10 == 0:
        #         print(ii)
        #     df = load_and_get_samples(f, cols, col_name=col_name, drug_names=drug_names)
        #     dfs.append(df)
        t = time() - t

        fea_df = pd.concat(dfs, axis=0).reset_index(drop=True)
        del dfs

        # fea_df.drop(columns='SMILES', inplace=True)
        fea_df = fea_df[fea_df[col_name].notna()]
        fea_df = fea_df.drop_duplicates(subset=[col_name]).reset_index(
            drop=True)

        # Cast feature columns to float32 and reattach the metadata columns
        xdata = extract_subset_fea(fea_df, fea_list=['dd'], fea_sep=fea_sep)
        xdata = xdata.astype(np.float32)
        # xdata = xdata.fillna(0)
        meta = fea_df.drop(columns=xdata.columns.tolist())
        fea_df = pd.concat([meta, xdata], axis=1)
        fea_df = fea_df.reset_index(drop=True)
        del meta, xdata

        return fea_df
    else:
        return None
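
A hypothetical call; the drug-set name and feature-folder name below are placeholders, since the real values depend on what lives under FEA_DIR:

fea_df = load_mordred_descriptors(drg_set='OZD',       # placeholder drug-set name
                                  fea_name='mordred',  # placeholder folder under FEA_DIR
                                  col_name='TITLE',
                                  drug_names=None,
                                  fea_sep='_',
                                  n_jobs=8,
                                  N=None)
if fea_df is not None:
    print(fea_df.shape)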
Example #4
def run(args):
    t0 = time()
    datapath = Path(args['datapath']).resolve()

    if args['max_size'] is not None:
        assert args['min_size'] < args['max_size'], f"min train size (min_size={args['min_size']}) "\
                                                    f"must be smaller than max train size "\
                                                    f"(max_size={args['max_size']})."

    if args['splitdir'] is None:
        splitdir = None
    else:
        splitdir = Path(args['splitdir']).resolve()
    split_id = args['split_id']

    # -----------------------------------------------
    #       Global outdir
    # -----------------------------------------------
    if args['gout'] is not None:
        gout = Path(args['gout']).resolve()
    else:
        gout = fdir.parent / 'lc.trn'
        gout = gout / datapath.with_suffix('.lc').name
    args['gout'] = str(gout)
    os.makedirs(gout, exist_ok=True)

    # -----------------------------------------------
    #       Run (single split) outdir
    # -----------------------------------------------
    if args['rout'] is not None:
        rout = gout / args['rout']
    else:
        if splitdir is None:
            rout = gout / 'run_0'
        else:
            rout = gout / f'split_{split_id}'
    args['rout'] = str(rout)
    os.makedirs(rout, exist_ok=True)

    # -----------------------------------------------
    #       Logger
    # -----------------------------------------------
    lg = Logger(rout / 'lc.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {fdir}')
    print_fn(f'\n{pformat(args)}')
    dump_dict(args, outpath=rout / 'trn.args.txt')

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(datapath)
    print_fn('data.shape {}'.format(data.shape))

    # Get features (x), target (y), and meta
    fea_list = args['fea_prfx']
    fea_sep = args['fea_sep']
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep=fea_sep)
    meta = data.drop(columns=xdata.columns)
    ydata = meta[[args['trg_name']]]
    del data

    # -----------------------------------------------
    #       Scale features
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])

    # -----------------------------------------------
    #       Data splits
    # -----------------------------------------------
    if splitdir is None:
        cv_lists = None
    else:
        split_pattern = f'1fold_s{split_id}_*_id.csv'
        single_split_files = glob(str(splitdir / split_pattern))

        # Get indices for the split
        for id_file in single_split_files:
            if 'tr_id' in id_file:
                tr_id = load_data(id_file).values.reshape(-1, )
            elif 'vl_id' in id_file:
                vl_id = load_data(id_file).values.reshape(-1, )
            elif 'te_id' in id_file:
                te_id = load_data(id_file).values.reshape(-1, )

        cv_lists = (tr_id, vl_id, te_id)

    # -----------------------------------------------
    #      ML model configs
    # -----------------------------------------------
    if args['ml'] == 'lgb':
        # LGBM regressor model definition
        import lightgbm as lgb
        framework = 'lightgbm'
        ml_model_def = lgb.LGBMRegressor
        mltype = 'reg'

        ml_init_kwargs = {
            'n_estimators': args['n_estimators'],
            'max_depth': args['max_depth'],
            'learning_rate': args['learning_rate'],
            'num_leaves': args['num_leaves'],
            'n_jobs': args['n_jobs'],
            'random_state': None
        }
        ml_fit_kwargs = {'verbose': False, 'early_stopping_rounds': 10}
        data_prep_def = None
        keras_callbacks_def = None
        keras_clr_kwargs = None

    elif args['ml'] == 'nn_reg0':
        # Keras model def
        from models.keras_model import (nn_reg0_model_def, data_prep_nn0_def,
                                        model_callback_def)
        framework = 'keras'
        mltype = 'reg'
        keras_callbacks_def = model_callback_def
        data_prep_def = data_prep_nn0_def

        ml_model_def = nn_reg0_model_def

        ml_init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': args['dr_rate'],
            'opt_name': args['opt'],
            'lr': args['lr'],
            'batchnorm': args['batchnorm']
        }
        ml_fit_kwargs = {
            'epochs': args['epoch'],
            'batch_size': args['batch_size'],
            'verbose': 1
        }
        keras_clr_kwargs = {}

    elif args['ml'] == 'nn_reg1':
        from models.keras_model import (nn_reg1_model_def, data_prep_nn1_def,
                                        model_callback_def)
        framework = 'keras'
        mltype = 'reg'
        keras_callbacks_def = model_callback_def
        data_prep_def = data_prep_nn1_def

        ml_model_def = nn_reg1_model_def

        x_ge = extract_subset_fea(xdata, fea_list=['ge'], fea_sep='_')
        x_dd = extract_subset_fea(xdata, fea_list=['dd'], fea_sep='_')

        ml_init_kwargs = {
            'in_dim_ge': x_ge.shape[1],
            'in_dim_dd': x_dd.shape[1],
            'dr_rate': args['dr_rate'],
            'opt_name': args['opt'],
            'lr': args['lr'],
            'batchnorm': args['batchnorm']
        }
        ml_fit_kwargs = {
            'epochs': args['epoch'],
            'batch_size': args['batch_size'],
            'verbose': 1
        }
        keras_clr_kwargs = {}
        del x_ge, x_dd

    # Print the NN architecture summary
    if len(ml_init_kwargs) and ('nn' in args['ml']):
        model = ml_model_def(**ml_init_kwargs)
        model.summary(print_fn=lg.logger.info)
        del model

    # -----------------------------------------------
    #      Learning curve
    # -----------------------------------------------
    # LC args
    lc_init_args = {
        'cv_lists': cv_lists,
        'n_splits': args['n_splits'],
        'mltype': mltype,
        'lc_step_scale': args['lc_step_scale'],
        'lc_sizes': args['lc_sizes'],
        'min_size': args['min_size'],
        'max_size': args['max_size'],
        'lc_sizes_arr': args['lc_sizes_arr'],
        'outdir': rout,
        'print_fn': print_fn
    }

    lc_trn_args = {
        'framework': framework,
        'n_jobs': args['n_jobs'],
        'ml_model_def': ml_model_def,
        'ml_init_args': ml_init_kwargs,
        'ml_fit_args': ml_fit_kwargs,
        'data_prep_def': data_prep_def,
        'keras_callbacks_def': keras_callbacks_def,
        'keras_clr_args': keras_clr_kwargs
    }

    # LC object
    lc_obj = LearningCurve(X=xdata, Y=ydata, meta=meta, **lc_init_args)
    lc_scores = lc_obj.trn_learning_curve(**lc_trn_args)

    # Dump all scores
    lc_scores.to_csv(rout / 'lc_scores.csv', index=False)

    # Dump args
    dump_dict(args, outpath=rout / 'args.txt')

    # ------------------------------------------------------
    if (time() - t0) // 3600 > 0:
        print_fn('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        print_fn('Runtime: {:.1f} mins'.format((time() - t0) / 60))

    print_fn('Done.')
    lg.close_logger()
    del xdata, ydata

    return None
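
For reference, run() is driven entirely by the args dict (normally built by an argparse parser elsewhere in the repo). A minimal LightGBM configuration covering the keys read above might look like the following; all values are illustrative placeholders, not defaults taken from the source:

args = {
    'datapath': 'data/ml_df.parquet',    # placeholder path to the master dataset
    'gout': None, 'rout': None,          # output dirs; derived automatically if None
    'splitdir': None, 'split_id': 0,     # optional precomputed split indices
    'min_size': 1024, 'max_size': None,  # learning-curve train-size bounds
    'lc_step_scale': 'log', 'lc_sizes': 5, 'lc_sizes_arr': None,
    'n_splits': 1,
    'fea_prfx': ['dd'], 'fea_sep': '_',  # feature prefixes and separator
    'trg_name': 'reg', 'scaler': 'stnd', # target column and scaler name
    'ml': 'lgb',
    'n_estimators': 100, 'max_depth': -1,
    'learning_rate': 0.1, 'num_leaves': 31,
    'n_jobs': 8,
}
run(args)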
def gen_ml_df(dd, trg_name, meta_cols=['TITLE', 'SMILES'], fea_list=['dsc'],
              score_name='reg', q_cls=0.025, bin_th=2.0, print_fn=print,
              outdir=Path('out'), outfigs=Path('outfigs')):
    """ Generate a single ML dataframe for the specified target column trg_name.
    Args:
        dd : dataframe with (molecules x targets) where the first col is TITLE
        trg_name : a column in dd representing the target 
        meta_cols : metadata columns to include in the dataframe
        score_name : rename the trg_name with score_name
        q_cls : quantile value to compute along the docking scores to generate the 'cls' col
        bin_th : threshold value of docking score to generate the 'binner' col
    
    Returns:
        dd_trg : the ML dataframe 
    """
    print_fn( f'Processing {trg_name} ...' )
    res = {}
    res['target'] = trg_name

    meta_cols = set(meta_cols).intersection(set(dd.columns.tolist()))
    meta_cols = [i for i in meta_cols]

    # fea_list = ['dsc', 'ecfp2', 'ecfp4', 'ecfp6']
    # fea_list = ['dsc']
    fea_sep = '.'
    fea_cols = extract_subset_fea_col_names(dd, fea_list=fea_list, fea_sep=fea_sep)
    cols = [trg_name] + meta_cols + fea_cols
    dd_trg = dd[ cols ]
    del dd

    # Drop NaN scores
    dd_trg = dd_trg[ ~dd_trg[trg_name].isna() ].reset_index(drop=True)

    # Rename the scores col
    dd_trg = dd_trg.rename( columns={trg_name: score_name} )

    # File name
    fname = 'ml.' + trg_name
    
    # Transform scores to positive
    dd_trg[score_name] = abs( np.clip(dd_trg[score_name], a_min=None, a_max=0) )
    res['min'], res['max'] = dd_trg[score_name].min(), dd_trg[score_name].max()
    bins = 50
    """
    p = dd[score_name].hist(bins=bins);
    p.set_title(f'Scores Clipped to 0: {fname}');
    p.set_ylabel('Count'); p.set_xlabel('Docking Score');
    plt.savefig(outfigs/f'dock_scores_clipped_{fname}.png');
    """
    
    # Add binner
    binner = [1 if x>=bin_th else 0 for x in dd_trg[score_name]]
    dd_trg.insert(loc=1, column='binner', value=binner)

    # -----------------------------------------
    # Create classification target ('cls')
    # -----------------------------------------
    # Find quantile value
    if dd_trg[score_name].min() >= 0: # if scores were transformed to >=0
        q_cls = 1.0 - q_cls
    cls_th = dd_trg[score_name].quantile(q=q_cls)
    res['cls_th'] = cls_th
    print_fn('Quantile score (q_cls={:.3f}): {:.3f}'.format( q_cls, cls_th ))

    # Generate a classification target col
    if dd_trg[score_name].min() >= 0: # if scores were transformed to >=0
        value = (dd_trg[score_name] >= cls_th).astype(int)
    else:
        value = (dd_trg[score_name] <= cls_th).astype(int)
    dd_trg.insert(loc=1, column='cls', value=value)
    # print_fn('Ratio {:.3f}'.format( dd['dock_bin'].sum() / dd.shape[0] ))

    # Plot
    hist, bin_edges = np.histogram(dd_trg[score_name], bins=bins)
    x = np.ones((10,)) * cls_th
    y = np.linspace(0, hist.max(), len(x))

    fig, ax = plt.subplots()
    plt.hist(dd_trg[score_name], bins=bins, density=False, facecolor='b', alpha=0.5)
    plt.title(f'Scores Clipped to 0: {fname}');
    plt.ylabel('Count'); plt.xlabel('Docking Score');
    plt.plot(x, y, 'r--', alpha=0.7, label=f'{q_cls}-th quantile')
    plt.grid(True)
    plt.savefig(outfigs/f'dock.score.bin.{fname}.png')

    # Separate the features
    def extract_and_save_fea( df, fea, to_csv=False ):
        """ Extract specific feature type (including metadata) and
        save to file. 
        """
        name = fea
        fea_prfx_drop = [i for i in fea_list if i!=fea]
        fea_cols_drop = extract_subset_fea_col_names(df, fea_list=fea_prfx_drop, fea_sep=fea_sep)
        data = df.drop( columns=fea_cols_drop )
        outpath_name = outdir/(fname+f'.{name}')
        data.to_parquet( str(outpath_name)+'.parquet' )
        if to_csv:
            data.to_csv( str(outpath_name)+'.csv', index=False )
        return data

    print_fn('Create and save dataframes ...')
    for fea in fea_list:
        to_csv = 'dsc' not in fea
        dsc_df = extract_and_save_fea( dd_trg, fea=fea, to_csv=to_csv )

    # Scale descriptors and save the scaler (save raw features rather than the scaled)
    if any('dsc' in i for i in fea_list):
        dsc_prfx = ('dsc'+fea_sep)
        from sklearn.preprocessing import StandardScaler
        import joblib
        xdata = extract_subset_fea(dsc_df, fea_list='dsc', fea_sep=fea_sep)
        cols = xdata.columns
        sc = StandardScaler( with_mean=True, with_std=True )
        sc.fit( xdata )
        sc_outpath = outdir/(fname+f'.dsc.scaler.pkl')
        joblib.dump(sc, sc_outpath)
        # sc_ = joblib.load( sc_outpath ) 

        # We decided to remove the feature-specific prefixes for descriptors
        dsc_df = dsc_df.rename(columns={c: c.split(dsc_prfx)[-1] if dsc_prfx in c else c for c in dsc_df.columns})
        dsc_df.to_csv( outdir/(fname+'.dsc.csv'), index=False)        

    try:
        import lightgbm as lgb
        from sklearn.model_selection import train_test_split
        from datasplit.splitter import data_splitter
        from ml.evals import calc_preds, calc_scores, dump_preds
        ml_model_def = lgb.LGBMRegressor
        ml_init_args = {'n_jobs': 8}
        ml_fit_args = {'verbose': False, 'early_stopping_rounds': 10}
        model = ml_model_def( **ml_init_args )
        ydata = dd_trg['reg']
        xdata = extract_subset_fea(dd_trg, fea_list=fea_list, fea_sep=fea_sep)
        x_, xte, y_, yte = train_test_split(xdata, ydata, test_size=0.2)
        xtr, xvl, ytr, yvl = train_test_split(x_, y_, test_size=0.2)
        ml_fit_args['eval_set'] = (xvl, yvl)
        model.fit(xtr, ytr, **ml_fit_args)
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype='reg')
        te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype='reg', metrics=None)
        res['r2'] = te_scores['r2']
        res['mae'] = te_scores['median_absolute_error']
    except Exception as e:
        print(f'Could not train the baseline model: {e}')

    return res
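
A hypothetical call; the parquet path and target column are placeholders for a (molecules x targets) docking-score table that also carries 'dsc.'-prefixed descriptor columns:

from pathlib import Path
import pandas as pd

outdir, outfigs = Path('out'), Path('outfigs')
outdir.mkdir(exist_ok=True)
outfigs.mkdir(exist_ok=True)

dd = pd.read_parquet('docking_with_dsc_features.parquet')  # placeholder path
res = gen_ml_df(dd, trg_name='target_1',                   # placeholder target column
                meta_cols=['TITLE', 'SMILES'], fea_list=['dsc'],
                outdir=outdir, outfigs=outfigs)
print(res)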