def run(args):
    """ Split a master drug-response dataset into train/val/test CV folds.

    Loads the (single) parquet file found under args['dirpath'], separates
    feature columns from meta columns, and, for several fold counts k,
    generates k-fold train/val/test index splits (group-aware when
    args['split_on'] names a meta column). The index CSVs, the feature/meta
    parquet tables, and diagnostic histograms are dumped into a sibling
    directory named '<dirpath>_splits'.

    Args:
        args (dict): parsed arguments; keys used here: 'dirpath', 'vl_size',
            'cell_fea', 'drug_fea', 'n_jobs', 'split_on'.
    """
    dirpath = Path(args['dirpath'])

    # Data splits: fraction of samples used for validation. The test set is
    # later carved out of the train portion with a matching size.
    vl_size = split_size(args['vl_size'])

    # Features: column-name prefixes identifying feature columns
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    fea_list = cell_fea + drug_fea

    # Other params (n_jobs is currently unused; reading it validates the key)
    n_jobs = args['n_jobs']

    # Hard split: when split_on names a meta column, samples sharing a value
    # in that column never appear in more than one of tr/vl/te ('group' CV).
    split_on = args['split_on'] if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method

    # TODO: this needs to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath) + '_splits')
    os.makedirs(outdir, exist_ok=True)

    lg = Logger(outdir/'splitter.log')
    # NOTE(review): `filepath` is not defined in this function — presumably a
    # module-level global (script path); confirm it exists at module scope.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=outdir/'args.txt')

    # -----------------------------------------------
    #       Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    files = list(dirpath.glob('**/*.parquet'))
    # BUG FIX: the original loaded `data` only when files existed, then used
    # the (possibly undefined) name unconditionally -> NameError on an empty
    # directory. Fail early with a clear error instead.
    if not files:
        raise FileNotFoundError(f'No parquet files found under {dirpath}.')
    if len(files) > 1:
        lg.logger.info(f'Found {len(files)} parquet files; using {files[0]}.')
    data = pd.read_parquet(files[0], engine='auto', columns=None)
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and meta (everything that is not a feature), dump both
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet(outdir/'xdata.parquet')
    meta.to_parquet(outdir/'meta.parquet')

    lg.logger.info('Total DD: {}'.format(len([c for c in xdata.columns if 'DD_' in c])))
    lg.logger.info('Total GE: {}'.format(len([c for c in xdata.columns if 'GE_' in c])))
    lg.logger.info('Unique cells: {}'.format(meta['CELL'].nunique()))
    lg.logger.info('Unique drugs: {}'.format(meta['DRUG'].nunique()))

    plot_hist(meta['AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_all.png')

    # -----------------------------------------------
    #       Generate CV splits
    # -----------------------------------------------
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info('\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=SEED)

        # meta[split_on].values[idx_vec] reorders the group labels to align
        # with the permuted index vector.
        cv_grp = meta[split_on].values[idx_vec] if split_on is not None else None
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds = {}
        vl_folds = {}
        te_folds = {}

        # Start CV iters (this loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold}')
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Size the test split relative to tr so that te ends up the same
            # size as vl: if f = |vl|/|all|, then te_size = f/(1-f) of |tr|
            # gives |te| = f*|all|.
            if cv_folds == 1:
                te_size_ = vl_size / (1 - vl_size)
            else:
                vl_frac = len(vl_id) / len(idx_vec)
                te_size_ = vl_frac / (1 - vl_frac)

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size_,
                                      mltype=mltype, shuffle=False, random_state=SEED)

            # Update the index array
            idx_vec_ = tr_id; del tr_id

            te_grp = meta[split_on].values[idx_vec_] if split_on is not None else None
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id]  # adjust the indices!
            te_id = idx_vec_[te_id]  # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()

            lg.logger.info('Train samples {} ({:.2f}%)'.format(len(tr_id), 100*len(tr_id)/xdata.shape[0]))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format(len(vl_id), 100*len(vl_id)/xdata.shape[0]))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format(len(te_id), 100*len(te_id)/xdata.shape[0]))

            # Confirm that group splits are correct (no group leaks across sets)
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convert to df. Fold columns may have different lengths, hence the
        # per-column pd.Series construction (from_dict is slow:
        # stackoverflow.com/questions/19736080/).
        tr_folds = pd.DataFrame({k: pd.Series(v) for k, v in tr_folds.items()})
        vl_folds = pd.DataFrame({k: pd.Series(v) for k, v in vl_folds.items()})
        te_folds = pd.DataFrame({k: pd.Series(v) for k, v in te_folds.items()})

        # Dump
        tr_folds.to_csv(outdir/f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir/f'{cv_folds}fold_vl_id.csv', index=False)
        te_folds.to_csv(outdir/f'{cv_folds}fold_te_id.csv', index=False)

        # Plot target dist only for the 1-fold case (uses the ids from the
        # last -- and only -- loop iteration).
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds == 1 and fold == 0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')

            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')

    lg.kill_logger()
    print('Done.')
Example #2
0
def run(args):
    """ Train a model on pre-generated CV splits and score tr/vl/te sets.

    Reads the split index CSVs and the xdata/meta parquet tables produced by
    the splitter from args['dirpath'], optionally scales features, trains one
    model per CV fold (Keras or LightGBM depending on args['model_name']),
    dumps per-fold train/val predictions and scores, and finally evaluates
    the last fold's best model on the held-out test set.

    Args:
        args (dict): parsed arguments; keys used here include 'dirpath',
            'target_name', 'cv_folds', 'epochs', 'batch_size', 'dr_rate',
            'opt', 'clr_*', 'model_name', 'skp_ep', 'n_jobs', 'scaler'.

    Raises:
        ValueError: if model_name lacks 'reg'/'cls', names an unknown model,
            or args['scaler'] is an unrecognized scaler name.
    """
    dirpath = Path(args['dirpath'])
    target_name = args['target_name']
    cv_folds = args['cv_folds']

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']

    # Optimizer and cyclical-LR settings
    opt_name = args['opt']
    clr_keras_kwargs = {
        'mode': args['clr_mode'],
        'base_lr': args['clr_base_lr'],
        'max_lr': args['clr_max_lr'],
        'gamma': args['clr_gamma']
    }

    # Other params
    model_name = args['model_name']
    skp_ep = args['skp_ep']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls'), inferred from the model name
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Data source name, e.g. 'gdsc' from 'gdsc_..._data'
    src = dirpath.name.split('_')[0]

    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath).split('_')[0] + '_trn')
    run_outdir = create_outdir(outdir, args, src)
    lg = Logger(run_outdir / 'logfile.log')
    # NOTE(review): `filepath` is not defined in this function — presumably a
    # module-level global (script path); confirm it exists at module scope.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=run_outdir / 'args.txt')

    # -----------------------------------------------
    #       Load data and pre-proc
    # -----------------------------------------------
    def get_file(fpath):
        """ Load a headerless single-column CSV as a 1-D array (or None). """
        return pd.read_csv(
            fpath, header=None).squeeze().values if fpath.is_file() else None

    def read_data_file(fpath, file_format='csv'):
        """ Load a csv/parquet file into a DataFrame; None if missing. """
        fpath = Path(fpath)
        if fpath.is_file():
            if file_format == 'csv':
                df = pd.read_csv(fpath)
            elif file_format == 'parquet':
                df = pd.read_parquet(fpath)
        else:
            df = None
        return df

    # Data splits (one column per fold; columns padded with NaN)
    tr_id = pd.read_csv(dirpath / f'{cv_folds}fold_tr_id.csv')
    vl_id = pd.read_csv(dirpath / f'{cv_folds}fold_vl_id.csv')
    te_id = pd.read_csv(dirpath / 'te_id.csv')

    tr_dct = {}
    vl_dct = {}

    for fold in range(tr_id.shape[1]):
        tr_dct[fold] = tr_id.iloc[:, fold].dropna().values.astype(int).tolist()
        vl_dct[fold] = vl_id.iloc[:, fold].dropna().values.astype(int).tolist()

    te_id = te_id.iloc[:, 0].dropna().values.astype(int).tolist()

    # Load data
    lg.logger.info(f'\nLoading data ...')
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[target_name]]

    # Scale
    lg.logger.info(f'\nScaling data ...')
    scaler = args['scaler']
    if scaler is not None:
        if scaler == 'stnd':
            scaler = StandardScaler()
        elif scaler == 'minmax':
            scaler = MinMaxScaler()
        elif scaler == 'rbst':
            scaler = RobustScaler()
        else:
            # BUG FIX: an unknown scaler name used to fall through and crash
            # later on str.fit_transform; fail with a clear message instead.
            raise ValueError(f"Unknown scaler: '{scaler}'.")

        # BUG FIX: scaling used to run unconditionally and crashed with
        # AttributeError when args['scaler'] was None; it is now applied only
        # when a scaler was actually requested.
        cols = xdata.columns
        xdata = pd.DataFrame(scaler.fit_transform(xdata),
                             columns=cols,
                             dtype=np.float32)

    # Test set
    xte = xdata.iloc[te_id, :]
    yte = np.squeeze(ydata.iloc[te_id, :]).values

    # -----------------------------------------------
    #      ML model configs
    # -----------------------------------------------
    if model_name == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {
            'n_jobs': n_jobs,
            'random_state': SEED,
            'logger': lg.logger
        }
        fit_kwargs = {'verbose': False}
    elif model_name == 'nn_reg':
        framework = 'keras'
        init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            # NOTE(review): `attn` is not defined in this function — presumably
            # a module-level global; confirm it exists at module scope.
            'attn': attn,
            'logger': lg.logger
        }
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}
    # BUG FIX: the original condition `model_name == 'nn_reg0' or 'nn_reg1' or
    # 'nn_reg2'` was always truthy (non-empty string literals), so this branch
    # swallowed every remaining model_name and the nn_reg3/nn_reg4 branch was
    # unreachable. Proper membership tests restore the intended dispatch.
    elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg2'):
        framework = 'keras'
        init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'logger': lg.logger
        }
        fit_kwargs = {
            'batch_size': batch_size,
            'epochs': epochs,
            'verbose': 1
        }  # 'validation_split': 0.1
    elif model_name in ('nn_reg3', 'nn_reg4'):
        framework = 'keras'
        init_kwargs = {
            'in_dim_rna': None,
            'in_dim_dsc': None,
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'logger': lg.logger
        }
        fit_kwargs = {
            'batch_size': batch_size,
            'epochs': epochs,
            'verbose': 1
        }  # 'validation_split': 0.1
    else:
        raise ValueError(f"Unknown model_name: '{model_name}'.")

    # -----------------------------------------------
    #      Train
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('=' * 50))
    lg.logger.info(f'Train {src} ...')
    lg.logger.info('=' * 50)

    # CV loop: one model per fold
    for fold, (tr_k, vl_k) in enumerate(zip(tr_dct.keys(), vl_dct.keys())):
        if lg.logger is not None: lg.logger.info(f'Fold {fold+1}/{cv_folds}')

        tr_id = tr_dct[tr_k]
        vl_id = vl_dct[vl_k]

        # Samples from this dataset are randomly sampled for training
        xtr = xdata.iloc[tr_id, :]
        ytr = np.squeeze(ydata.iloc[tr_id, :]).values

        # A fixed set of validation samples for the current CV split
        xvl = xdata.iloc[vl_id, :]
        yvl = np.squeeze(ydata.iloc[vl_id, :]).values

        # Get the estimator
        estimator = ml_models.get_model(model_name, init_kwargs=init_kwargs)
        model = estimator.model

        keras.utils.plot_model(model, to_file=run_outdir / 'nn_model.png')

        # Callbacks: checkpoint every epoch, CSV log, LR schedule, early stop
        model_checkpoint_dir = run_outdir / 'models'
        os.makedirs(model_checkpoint_dir, exist_ok=True)
        checkpointer = ModelCheckpoint(str(
            model_checkpoint_dir /
            'model.ep_{epoch:d}-val_loss_{val_loss:.4f}-val_mae_{val_mean_absolute_error:.4f}.h5'
        ),
                                       save_best_only=False)
        csv_logger = CSVLogger(run_outdir / 'training.log')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.75,
                                      patience=20,
                                      verbose=1,
                                      mode='auto',
                                      min_delta=0.0001,
                                      cooldown=3,
                                      min_lr=0.000000001)
        early_stop = EarlyStopping(monitor='val_loss', patience=60, verbose=1)
        keras_callbacks = [checkpointer, csv_logger, early_stop, reduce_lr]

        if clr_keras_kwargs['mode'] is not None:
            keras_callbacks.append(
                ml_models.clr_keras_callback(**clr_keras_kwargs))

        # Fit params
        fit_kwargs['validation_data'] = (xvl, yvl)
        fit_kwargs['callbacks'] = keras_callbacks

        # Train
        t0 = time()
        history = model.fit(xtr, ytr, **fit_kwargs)
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

        # Dump model, history, plots
        model.save(str(run_outdir / 'model_final.h5'))
        hh = ml_models.save_krs_history(history, outdir=run_outdir)
        ml_models.plot_prfrm_metrics(history,
                                     title=f'Training',
                                     skp_ep=skp_ep,
                                     add_lr=True,
                                     outdir=run_outdir)

        # Load the best checkpoint (lowest val MAE) to make preds
        eval_metric = 'val_mean_absolute_error'
        ep_best = hh.loc[hh[eval_metric] == hh[eval_metric].min(),
                         'epoch'].values[0]
        mpath = glob(
            str(model_checkpoint_dir / f'model.ep_{ep_best}-val_loss*.h5'))[0]
        model = load_model(mpath)

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_dict(tr_scores, outpath=run_outdir / 'tr_scores.txt')
        pd.DataFrame({
            'y_true': y_true.reshape(-1),
            'y_pred': y_pred.reshape(-1, )
        }).to_csv(run_outdir / 'tr_preds.csv', index=False)
        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_dict(vl_scores, outpath=run_outdir / 'vl_scores.txt')
        pd.DataFrame({
            'y_true': y_true.reshape(-1, ),
            'y_pred': y_pred.reshape(-1, )
        }).to_csv(run_outdir / 'vl_preds.csv', index=False)

    # Calc preds and scores for the test set (uses the last fold's best model)
    y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
    te_scores = calc_scores(y_true=y_true,
                            y_pred=y_pred,
                            mltype=mltype,
                            metrics=None)
    dump_dict(te_scores, outpath=run_outdir / 'te_scores.txt')
    pd.DataFrame({
        'y_true': y_true.reshape(-1),
        'y_pred': y_pred.reshape(-1, )
    }).to_csv(run_outdir / 'te_preds.csv', index=False)

    lg.kill_logger()
    del xdata, ydata

    print('Done.')
Example #3
0
def run(args):
    """Train/evaluate a model on each pre-computed data split and
    aggregate per-split predictions, scores, and (for classification)
    confusion-matrix summaries.

    Args:
        args : dict of run parameters (paths, feature prefixes, model
            name, gap thresholds, etc.)
    """
    # Global outdir
    gout = Path(args['global_outdir'])
    os.makedirs(gout, exist_ok=True)

    # BUG FIX: `t0` is referenced later for runtime logging but was never
    # assigned anywhere in this function (NameError); start the clock here.
    t0 = time()

    # dirpath = verify_dirpath(args['dirpath'])
    # NOTE(review): `filepath` is assumed to be a module-level Path
    # (script location) -- confirm it is defined at file scope.
    data = read_data_file(filepath / args['filepath'], 'parquet')
    print('data.shape', data.shape)

    # Get features (x), target (y), and meta
    fea_list = args['cell_fea'] + args['drug_fea']
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    ydata = meta[[args['target_name']]]
    del data  # free the full table; only the pieces above are needed

    # ML type ('reg' or 'cls') is inferred from the model name
    if 'reg' in args['model_name']:
        mltype = 'reg'
    elif 'cls' in args['model_name']:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Create logger
    lg = Logger(gout / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    def get_unq_split_ids(all_splits_path):
        """ Return the sorted unique split ids parsed from file names of
        the form <...>/1fold_<id>_*.csv (the token after the first '_'). """
        # Use each path directly instead of re-indexing the list.
        unq = [p.split(os.sep)[-1].split('_')[1] for p in all_splits_path]
        return np.unique(unq)

    all_splits_path = glob(str(Path(args['splitpath']) / '1fold_*_id.csv'))
    unq_split_ids = get_unq_split_ids(all_splits_path)
    run_times = []

    # Scores (dicts) appended per split
    tr_scores_all = []
    vl_scores_all = []
    te_scores_all = []

    # Sample sizes per run (tab-separated table)
    file_smp_sz = open(gout / 'sample_sz', 'w')
    file_smp_sz.write('run\ttr_sz\tvl_sz\tte_sz\n')

    # Iterate over splits
    # NOTE(review): n_splits + 1 is kept from the original -- presumably
    # intentional (slice end is exclusive); confirm.
    n_splits = None if args['n_splits'] is None else (args['n_splits'] + 1)
    for i, split_id in enumerate(unq_split_ids[:n_splits]):

        # Collect the id files (tr/vl/te) that belong to this split
        aa = [p for p in all_splits_path if f'1fold_{split_id}' in p]
        if len(aa) < 2:
            # BUG FIX: this message referenced an undefined name `s`
            # (NameError); use split_id.
            print(f'The split {split_id} contains only one file.')
            continue
        for id_file in aa:
            if 'tr_id' in id_file:
                tr_id = read_data_file(id_file)
            elif 'vl_id' in id_file:
                vl_id = read_data_file(id_file)
            elif 'te_id' in id_file:
                te_id = read_data_file(id_file)

        # Define run outdir
        rout = gout / f'run_{split_id}'
        os.makedirs(rout, exist_ok=True)

        # Extract Train (T), Validation (V), and Test (E) sets.
        # Each id file holds row indices in its first column.
        tr_id = tr_id.iloc[:, 0].values.astype(int).tolist()
        vl_id = vl_id.iloc[:, 0].values.astype(int).tolist()
        te_id = te_id.iloc[:, 0].values.astype(int).tolist()
        xtr, ytr, mtr = get_data_by_id(tr_id, xdata, ydata, meta)
        xvl, yvl, mvl = get_data_by_id(vl_id, xdata, ydata, meta)
        xte, yte, mte = get_data_by_id(te_id, xdata, ydata, meta)

        # Remove training samples whose response falls inside the
        # (min_gap, max_gap) band ("AUC gap").
        min_gap = args['min_gap']
        max_gap = args['max_gap']
        if (min_gap is not None) and (max_gap is not None):
            idx = (ytr.values > min_gap) & (ytr.values < max_gap)
            xtr = xtr[~idx]
            mtr = mtr[~idx]
            ytr = ytr[~idx]

        def drop_samples(x_df, y_df, m_df, items_to_drop, drop_by: str):
            """ Remove rows whose `drop_by` meta value (a column such as
            'CELL', 'DRUG', or 'CTYPE') is listed in items_to_drop, and
            reset the indexes of all three aligned frames. """
            keep = ~m_df[drop_by].isin(items_to_drop)
            x_df = x_df.loc[keep].reset_index(drop=True)
            y_df = y_df.loc[keep].reset_index(drop=True)
            m_df = m_df.loc[keep].reset_index(drop=True)
            return x_df, y_df, m_df

        # Drop specific cell lines when a drop-list file is provided
        # (one cell name per line).
        if args['cell_list_drop'] is not None:
            cell_to_drop_fpath = Path(args['cell_list_drop'])
            if cell_to_drop_fpath.exists():
                # BUG FIX: this open() referenced a misspelled name
                # (`cell_to_path_fpath`), which raised NameError.
                with open(cell_to_drop_fpath, 'r') as f:
                    cells_to_drop = [line.rstrip() for line in f]
                    # BUG FIX: drop_samples requires the `drop_by` column;
                    # these calls omitted it (TypeError). Cell lines are
                    # matched against the CELL meta column.
                    xtr, ytr, mtr = drop_samples(x_df=xtr,
                                                 y_df=ytr,
                                                 m_df=mtr,
                                                 items_to_drop=cells_to_drop,
                                                 drop_by='CELL')
                    xvl, yvl, mvl = drop_samples(x_df=xvl,
                                                 y_df=yvl,
                                                 m_df=mvl,
                                                 items_to_drop=cells_to_drop,
                                                 drop_by='CELL')
                    xte, yte, mte = drop_samples(x_df=xte,
                                                 y_df=yte,
                                                 m_df=mte,
                                                 items_to_drop=cells_to_drop,
                                                 drop_by='CELL')

        # Record sample sizes for this split
        line = 's{}\t{}\t{}\t{}\n'.format(split_id, xtr.shape[0], xvl.shape[0],
                                          xte.shape[0])
        file_smp_sz.write(line)

        # Binarize responses for classification (threshold 0.5)
        if mltype == 'cls':
            ytr = bin_rsp(ytr, resp_thres=0.5)
            yvl = bin_rsp(yvl, resp_thres=0.5)
            yte = bin_rsp(yte, resp_thres=0.5)

        # Map model name to training framework
        if 'lgb' in args['model_name']:
            args['framework'] = 'lightgbm'
        elif args['model_name'] == 'rf_reg':
            args['framework'] = 'sklearn'
        elif 'nn_' in args['model_name']:
            args['framework'] = 'keras'

        model_init_kwargs, model_fit_kwargs = get_model_kwargs(args)

        # Get the estimator
        estimator = ml_models.get_model(args['model_name'],
                                        init_kwargs=model_init_kwargs)
        model = estimator.model

        # Train
        eval_set = (xvl, yvl)
        if args['framework'] == 'lightgbm':
            model, runtime = trn_lgbm_model(model=model,
                                            xtr=xtr,
                                            ytr=ytr,
                                            eval_set=eval_set,
                                            fit_kwargs=model_fit_kwargs)
        elif args['framework'] == 'sklearn':
            model, runtime = trn_sklearn_model(model=model,
                                               xtr_sub=xtr,
                                               ytr_sub=ytr,
                                               eval_set=None,
                                               fit_kwargs=model_fit_kwargs)
        elif args['framework'] == 'keras':
            model, runtime = trn_keras_model(model=model,
                                             xtr_sub=xtr,
                                             ytr_sub=ytr,
                                             eval_set=eval_set)
        elif args['framework'] == 'pytorch':
            pass
        else:
            # BUG FIX: the message referenced an undefined local name
            # `framework`; use the value actually tested above.
            raise ValueError(
                "Framework {} is not yet supported.".format(args['framework']))

        if model is None:
            continue  # sometimes keras fails to train a model (evaluates to nan)

        # Append runtime
        run_times.append(runtime)

        # Dump model
        if args['save_model']:
            joblib.dump(model,
                        filename=rout /
                        ('model.' + args['model_name'] + '.pkl'))

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_preds(y_true, y_pred, meta=mtr, outpath=rout / 'preds_tr.csv')
        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_preds(y_true, y_pred, meta=mvl, outpath=rout / 'preds_vl.csv')
        # ... test set
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
        te_scores = calc_scores(y_true=y_true,
                                y_pred=y_pred,
                                mltype=mltype,
                                metrics=None)
        dump_preds(y_true, y_pred, meta=mte, outpath=rout / 'preds_te.csv')

        # Add metadata (tag each score dict with its split id)
        tr_scores['run'] = split_id
        vl_scores['run'] = split_id
        te_scores['run'] = split_id

        # Append scores (dicts)
        tr_scores_all.append(tr_scores)
        vl_scores_all.append(vl_scores)
        te_scores_all.append(te_scores)

        # Free space
        # del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, tr_, vl_
        del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, eval_set, model, estimator

        # Periodic progress log
        if i % 10 == 0:
            print(f'Finished {split_id}')

    file_smp_sz.close()

    # Scores to df (one row per split)
    tr_scores_df = scores_to_df(tr_scores_all)
    vl_scores_df = scores_to_df(vl_scores_all)
    te_scores_df = scores_to_df(te_scores_all)

    tr_scores_df.to_csv(gout / 'tr_scores.csv', index=False)
    vl_scores_df.to_csv(gout / 'vl_scores.csv', index=False)
    te_scores_df.to_csv(gout / 'te_scores.csv', index=False)

    # NOTE(review): `t0` is never assigned in this function as written, so
    # this raises NameError; a `t0 = time()` is needed near the top of run().
    if (time() - t0) // 3600 > 0:
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        lg.logger.info('Runtime: {:.1f} min'.format((time() - t0) / 60))

    del tr_scores_df, vl_scores_df, te_scores_df

    # --------------------------------------------------------
    # Calc stats
    def reorg_cols(df, col_first: str):
        """ Return df with column `col_first` moved to the front; the
        remaining columns keep their relative order. """
        ordered = list(df.columns)
        ordered.insert(0, ordered.pop(ordered.index(col_first)))
        return df[ordered]

    def agg_preds_from_cls_runs(runs_dirs, phase='_te.csv', verbose=False):
        """ Collect per-run prediction CSVs (train/val/test, selected by
        `phase`) into one dataframe, tagging rows with a 'run' id column
        and a global 'idx' column. """
        frames = []
        for k, run_dir in enumerate(runs_dirs):
            if '_tr.csv' in phase:
                df_run = pd.read_csv(run_dir / 'preds_tr.csv')
            elif '_vl.csv' in phase:
                df_run = pd.read_csv(run_dir / 'preds_vl.csv')
            elif '_te.csv' in phase:
                df_run = pd.read_csv(run_dir / 'preds_te.csv')

            # Run id is the last '_'-separated token of the dir name
            df_run['run'] = str(run_dir).split(os.sep)[-1].split('_')[-1]
            frames.append(df_run)

            if verbose and k % 20 == 0:
                print(f'Processing {run_dir}')

        agg = pd.concat(frames, axis=0)

        # Put 'run' first, sort by it, and materialize a global row index
        agg = reorg_cols(agg, col_first='run').sort_values('run').reset_index(
            drop=True).reset_index().rename(columns={'index': 'idx'})
        return agg

    # Concat test-set preds from all runs
    runs_dirs = [Path(p) for p in glob(str(gout / 'run_*'))]
    prd_te_all = agg_preds_from_cls_runs(runs_dirs, phase='_te.csv')
    if 'source' not in [str(c).lower() for c in prd_te_all.columns.to_list()]:
        # Derive SOURCE from the CELL name prefix (e.g. 'CCLE.X' -> 'ccle')
        prd_te_all.insert(
            loc=2,
            column='SOURCE',
            value=[s.split('.')[0].lower() for s in prd_te_all['CELL']])

    # Cancer types (CELL -> CTYPE mapping)
    cancer_types = pd.read_csv(filepath / 'data/combined_cancer_types',
                               sep='\t',
                               names=['CELL', 'CTYPE'])

    # Add CTYPE column
    prd_te_all = pd.merge(prd_te_all, cancer_types, on='CELL')
    prd_te_all = reorg_cols(prd_te_all, col_first='CTYPE')

    # Rename to classification-style column names
    prd_te_all = prd_te_all.rename(columns={
        'y_true': 'y_true_cls',
        'y_pred': 'y_pred_prob'
    })

    # Retain specific columns
    cols = [
        'idx', 'run', 'SOURCE', 'CTYPE', 'CELL', 'DRUG', 'R2fit', 'AUC',
        'y_true_cls', 'y_pred_prob'
    ]
    prd_te_all = prd_te_all[cols]

    # Add col of predicted labels (probability threshold 0.5)
    prd_te_all['y_pred_cls'] = prd_te_all.y_pred_prob.map(lambda x: 0
                                                          if x < 0.5 else 1)

    # Absolute difference between the label and the predicted probability
    # (this is the square root of the per-sample Brier term).
    prd_te_all['prob_err'] = abs(prd_te_all.y_true_cls -
                                 prd_te_all.y_pred_prob)

    # Bin AUC values into 0.1-wide bins
    bins = np.arange(0, 1.1, 0.1).tolist()
    prd_te_all['AUC_bin'] = pd.cut(prd_te_all.AUC,
                                   bins,
                                   right=True,
                                   labels=None,
                                   retbins=False,
                                   precision=3,
                                   include_lowest=False,
                                   duplicates='raise')

    # Categorize each prediction (confusion-matrix cell).
    # BUG FIX: the original used chained indexing
    # (prd_te_all.prd_cat[mask] = ...), which is not guaranteed to write
    # back to the frame (SettingWithCopy); use .loc assignment instead.
    prd_te_all['prd_cat'] = None
    prd_te_all.loc[(prd_te_all.y_true_cls == 1)
                   & (prd_te_all.y_pred_cls == 1), 'prd_cat'] = 'TP'
    prd_te_all.loc[(prd_te_all.y_true_cls == 0)
                   & (prd_te_all.y_pred_cls == 0), 'prd_cat'] = 'TN'
    prd_te_all.loc[(prd_te_all.y_true_cls == 1)
                   & (prd_te_all.y_pred_cls == 0), 'prd_cat'] = 'FN'
    prd_te_all.loc[(prd_te_all.y_true_cls == 0)
                   & (prd_te_all.y_pred_cls == 1), 'prd_cat'] = 'FP'

    # Boolean indicator cols
    prd_te_all['TP'] = prd_te_all.prd_cat == 'TP'
    prd_te_all['TN'] = prd_te_all.prd_cat == 'TN'
    prd_te_all['FP'] = prd_te_all.prd_cat == 'FP'
    prd_te_all['FN'] = prd_te_all.prd_cat == 'FN'

    # Save aggregated master table.
    # BUG FIX: this was written to the current working directory; write it
    # into the global outdir like every other artifact.
    prd_te_all.to_csv(gout / 'prd_te_all.csv', index=False)

    # Confusion-matrix stats over all aggregated test predictions
    from sklearn.metrics import confusion_matrix
    y_true_cls = prd_te_all.y_true_cls
    y_pred_cls = prd_te_all.y_pred_cls
    tn, fp, fn, tp = confusion_matrix(y_true_cls, y_pred_cls).ravel()

    mcc = sklearn.metrics.matthews_corrcoef(y_true_cls,
                                            y_pred_cls,
                                            sample_weight=None)
    print('TN:', tn)
    print('FP:', fp)
    print('FN:', fn)
    print('TP:', tp)
    print('FPR:', fp / (fp + tn))
    print('FNR:', fn / (fn + tp))
    print('MCC:', mcc)

    # BUG FIX: the 'TN' line was written twice in the original.
    with open(gout / 'scores.txt', 'w') as f:
        f.write('TN: {:d}\n'.format(tn))
        f.write('FP: {:d}\n'.format(fp))
        f.write('FN: {:d}\n'.format(fn))
        f.write('TP: {:d}\n'.format(tp))
        f.write('FPR: {:.5f}\n'.format(fp / (fp + tn)))
        f.write('FNR: {:.5f}\n'.format(fn / (fn + tp)))
        f.write('MCC: {:.5f}\n'.format(mcc))

    # Confusion Matrix plot (raw counts)
    conf = confusion_matrix(y_true_cls, y_pred_cls, normalize=None)
    conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp'])
    conf_plot.plot(include_values=True,
                   cmap=plt.cm.Blues,
                   ax=None,
                   xticks_rotation=None,
                   values_format='d')
    plt.savefig(gout / 'conf_mat.png', dpi=100)

    # Confusion Matrix plot (normalized over all samples)
    conf = confusion_matrix(y_true_cls, y_pred_cls, normalize='all')
    conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp'])
    conf_plot.plot(include_values=True,
                   cmap=plt.cm.Blues,
                   ax=None,
                   xticks_rotation=None,
                   values_format='.2f')
    conf_plot.ax_.set_title('Normalized')
    plt.savefig(gout / 'conf_mat_norm.png', dpi=100)

    def add_conf_data(data):
        """ Add per-row confusion counts (TP/TN/FN/FP, taken from the
        y_pred_cls_0/1 columns according to y_true) and the derived rates
        (TPR/TNR/FPR/FNR); a rate is NaN when both of its confusion terms
        are zero. """
        def _pick(row, col, true_val):
            # Prediction count from `col` when the row's y_true matches.
            return getattr(row, col) if row.y_true == true_val else False

        data['TP'] = data.apply(lambda r: _pick(r, 'y_pred_cls_1', 1), axis=1)
        data['TN'] = data.apply(lambda r: _pick(r, 'y_pred_cls_0', 0), axis=1)
        data['FN'] = data.apply(lambda r: _pick(r, 'y_pred_cls_0', 1), axis=1)
        data['FP'] = data.apply(lambda r: _pick(r, 'y_pred_cls_1', 0), axis=1)

        def _rate(num, a, b):
            # NaN when both confusion terms are zero, else num / (a + b).
            return np.nan if (a == 0) & (b == 0) else num / (a + b)

        # sensitivity, recall: TP/P = TP/(TP+FN)
        data['TPR'] = data.apply(lambda r: _rate(r.TP, r.TP, r.FN), axis=1)
        # specificity: TN/N = TN/(TN+FP)
        data['TNR'] = data.apply(lambda r: _rate(r.TN, r.TN, r.FP), axis=1)
        # fall-out: FP/N = FP/(FP+TN)
        data['FPR'] = data.apply(lambda r: _rate(r.FP, r.TN, r.FP), axis=1)
        # miss-rate: FN/NP = FN/(FN+TP)
        data['FNR'] = data.apply(lambda r: _rate(r.FN, r.TP, r.FN), axis=1)
        return data

    # Summary tables: per-group confusion stats
    prd_te_to_grp = prd_te_all.copy()
    prd_te_to_grp['y_pred_prob_median'] = prd_te_to_grp.y_pred_prob
    prd_te_to_grp['y_pred_prob_std'] = prd_te_to_grp.y_pred_prob
    prd_te_to_grp['y_pred_tot'] = prd_te_to_grp.idx
    # BUG FIX: the 'y_pred' column was renamed to 'y_pred_prob' earlier in
    # this function, so `.y_pred` raised AttributeError.
    prd_te_to_grp['y_pred_cls_0'] = prd_te_to_grp.y_pred_prob.map(
        lambda x: True if x < 0.5 else False)
    prd_te_to_grp['y_pred_cls_1'] = prd_te_to_grp.y_pred_prob.map(
        lambda x: True if x >= 0.5 else False)
    prd_te_to_grp['y_true_unq_vals'] = prd_te_to_grp.y_true_cls

    # BUG FIX: there is no 'y_true' column after the rename/column-subset
    # above, so groupby([by, 'y_true']) raised KeyError. Group by
    # 'y_true_cls' and rename it back to 'y_true', which is the column
    # name add_conf_data expects.
    def _summarize_by(by, unq_cols):
        """ Group by (`by`, y_true_cls), aggregate prediction stats, and
        attach confusion counts/rates. """
        agg_spec = {c: 'unique' for c in unq_cols}
        agg_spec.update({
            'y_true_unq_vals': 'unique',
            'y_pred_prob_median': np.median,
            'y_pred_prob_std': np.std,
            'y_pred_cls_0': lambda x: int(sum(x)),
            'y_pred_cls_1': lambda x: int(sum(x)),
            'y_pred_tot': lambda x: len(np.unique(x)),
        })
        sm = prd_te_to_grp.groupby([by, 'y_true_cls']).agg(
            agg_spec).reset_index().sort_values(by, ascending=True)
        sm = sm.rename(columns={'y_true_cls': 'y_true'})
        # Count of unique true values in the group (1 when not an array)
        sm['y_true_unq_vals'] = sm.y_true_unq_vals.map(
            lambda x: len(x) if type(x) == np.ndarray else 1)
        return add_conf_data(sm)

    # Groupby Cell
    sm_cell = _summarize_by('CELL', ['DRUG', 'CTYPE'])
    sm_cell.to_csv(gout / 'sm_by_cell.csv', index=False)

    # Groupby Cancer Type
    sm_ctype = _summarize_by('CTYPE', ['DRUG', 'CELL'])
    sm_ctype.to_csv(gout / 'sm_by_ctype.csv', index=False)

    # Groupby Drug
    sm_drug = _summarize_by('DRUG', ['CTYPE', 'CELL'])
    sm_drug.to_csv(gout / 'sm_by_drug.csv', index=False)

    # --------------------------------------------------------
    lg.kill_logger()
Example #4
0
}

# Feature-prefix -> storage dtype mapping: float32 for rna/dsc, int8 for
# cnv/snp/fng.
prfx_dtypes = {
    'rna': np.float32,
    'cnv': np.int8,
    'snp': np.int8,
    'dsc': np.float32,
    'fng': np.int8
}

# -----------------------------------------------
#     Create outdir and logger
# -----------------------------------------------
# NOTE(review): `create_outdir`, `OUTDIR`, `args`, `filepath`, and the
# load_* helpers/filename constants below are defined in a part of the
# script not shown here -- confirm before reuse.
outdir = create_outdir(OUTDIR, args)
args['outdir'] = str(outdir)
lg = Logger(outdir / 'create_tidy_logfile.log')
lg.logger.info(f'File path: {filepath}')
lg.logger.info(f'\n{pformat(args)}')
dump_dict(args, outpath=outdir / 'create_tidy_args.txt')  # dump args

# -----------------------------------------------
#     Load response data, and features
# -----------------------------------------------
rsp = load_rsp(RSP_FILENAME, logger=lg.logger, args=args)
rna = load_rna(rna_norm=args['rna_norm'],
               logger=lg.logger,
               float_type=prfx_dtypes['rna'])
dsc = load_dsc(DSC_FILENAME, logger=lg.logger, float_type=prfx_dtypes['dsc'])

# -----------------------------------------------
#     Load cell and drug meta
def run(args):
    """ Generate learning curves per data source: load features, targets,
    and CV split ids, optionally scale features, then train the configured
    model over growing training-set shards. """
    dirpath = Path(args['dirpath'])
    # dname = args['dname']
    # src_names = args['src_names']

    # Target
    target_name = args['target_name']

    # Data split
    cv_folds = args['cv_folds']

    # Features
    cell_fea = args['cell_features']
    drug_fea = args['drug_features']
    other_fea = args['other_features']
    fea_list = cell_fea + drug_fea + other_fea

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']

    # Optimizer
    opt_name = args['opt']
    # Cyclical-LR settings passed through to the keras training loop
    clr_keras_kwargs = {
        'mode': args['clr_mode'],
        'base_lr': args['clr_base_lr'],
        'max_lr': args['clr_max_lr'],
        'gamma': args['clr_gamma']
    }

    # Learning curve
    n_shards = args['n_shards']

    # Other params
    # framework = args['framework']
    model_name = args['model_name']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls') inferred from the model name
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Define metrics
    # metrics = {'r2': 'r2',
    #            'neg_mean_absolute_error': 'neg_mean_absolute_error', #sklearn.metrics.neg_mean_absolute_error,
    #            'neg_median_absolute_error': 'neg_median_absolute_error', #sklearn.metrics.neg_median_absolute_error,
    #            'neg_mean_squared_error': 'neg_mean_squared_error', #sklearn.metrics.neg_mean_squared_error,
    #            'reg_auroc_score': utils.reg_auroc_score}

    # ========================================================================
    #       Load data and pre-proc
    # ========================================================================
    # Maps source name -> (ydata, xdata, tr_id, vl_id)
    dfs = {}

    def get_file(fpath):
        """ Load a header-less csv as a flat values array; None when
        fpath is not an existing file. """
        if not fpath.is_file():
            return None
        return pd.read_csv(fpath, header=None).squeeze().values

    def read_data_file(fpath, file_format='csv'):
        """ Load a csv or parquet file into a dataframe.

        Returns:
            pd.DataFrame, or None when the file does not exist.
        Raises:
            ValueError: for an unsupported file_format. (Previously an
            unknown format on an existing file fell through and raised
            NameError on the unbound `df`.)
        """
        fpath = Path(fpath)
        if not fpath.is_file():
            return None
        if file_format == 'csv':
            return pd.read_csv(fpath)
        if file_format == 'parquet':
            return pd.read_parquet(fpath)
        raise ValueError(f'Unsupported file_format: {file_format!r}')

    if dirpath is not None:
        xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
        meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
        ydata = meta[[target_name]]

        tr_id = pd.read_csv(dirpath / f'{cv_folds}fold_tr_id.csv')
        vl_id = pd.read_csv(dirpath / f'{cv_folds}fold_vl_id.csv')

        # Source name is the dir-name prefix (e.g. 'gdsc_...' -> 'gdsc')
        src = dirpath.name.split('_')[0]
        dfs[src] = (ydata, xdata, tr_id, vl_id)

    elif args.get('dname') == 'combined':
        # BUG FIX: `dname` was an undefined name (its assignment is
        # commented out above); read it from args instead.
        # TODO: this is not used anymore (probably won't work) -- it still
        # references cv_method and src_names, which are not defined here.
        DATADIR = file_path / '../../data/processed/data_splits'
        DATAFILENAME = 'data.parquet'
        dirs = glob(str(DATADIR / '*'))

        for src in src_names:
            print(f'\n{src} ...')
            subdir = f'{src}_cv_{cv_method}'
            if str(DATADIR / subdir) in dirs:
                # Get the CV indexes
                tr_id = pd.read_csv(DATADIR / subdir /
                                    f'{cv_folds}fold_tr_id.csv')
                vl_id = pd.read_csv(DATADIR / subdir /
                                    f'{cv_folds}fold_vl_id.csv')

                # Get the data
                datapath = DATADIR / subdir / DATAFILENAME
                data = pd.read_parquet(datapath)
                xdata, _, meta, _ = break_src_data(
                    data, target=None, scaler=None)  # logger=lg.logger
                ydata = meta[[target_name]]

                dfs[src] = (ydata, xdata, tr_id, vl_id)
                del data, xdata, ydata, tr_id, vl_id, src

    for src, data in dfs.items():
        ydata, xdata, tr_id, vl_id = data[0], data[1], data[2], data[3]

        # Scale features only when a scaler is requested.
        # BUG FIX: fit_transform was previously called unconditionally,
        # raising AttributeError when args['scaler'] is None (and an
        # unrecognized name silently left `scaler` as a string).
        scaler = args['scaler']
        if scaler is not None:
            if scaler == 'stnd':
                scaler = StandardScaler()
            elif scaler == 'minmax':
                scaler = MinMaxScaler()
            elif scaler == 'rbst':
                scaler = RobustScaler()
            else:
                raise ValueError(f'Unknown scaler: {scaler}')

            cols = xdata.columns
            xdata = pd.DataFrame(scaler.fit_transform(xdata),
                                 columns=cols,
                                 dtype=np.float32)

        # -----------------------------------------------
        #       Create outdir and logger
        # -----------------------------------------------
        run_outdir = create_outdir(OUTDIR, args, src)
        lg = Logger(run_outdir / 'logfile.log')
        lg.logger.info(f'File path: {file_path}')
        lg.logger.info(f'\n{pformat(args)}')

        # Dump args to file
        utils.dump_dict(args, outpath=run_outdir / 'args.txt')

        # -----------------------------------------------
        #      ML model configs
        # -----------------------------------------------
        if model_name == 'lgb_reg':
            framework = 'lightgbm'
            init_kwargs = {
                'n_jobs': n_jobs,
                'random_state': SEED,
                'logger': lg.logger
            }
            fit_kwargs = {'verbose': False}
        elif model_name == 'nn_reg':
            framework = 'keras'
            init_kwargs = {
                'input_dim': xdata.shape[1],
                'dr_rate': dr_rate,
                'opt_name': opt_name,
                # BUG FIX: `attn` was an undefined name; read it from args.
                # NOTE(review): assumes args carries an 'attn' key -- confirm.
                'attn': args['attn'],
                'logger': lg.logger
            }
            fit_kwargs = {
                'batch_size': batch_size,
                'epochs': epochs,
                'verbose': 1
            }
        # BUG FIX: `model_name == 'nn_reg0' or 'nn_reg1' or ...` is always
        # truthy (non-empty string), so the original branch matched ANY
        # model name; use a proper membership test.
        elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg2'):
            framework = 'keras'
            init_kwargs = {
                'input_dim': xdata.shape[1],
                'dr_rate': dr_rate,
                'opt_name': opt_name,
                'logger': lg.logger
            }
            fit_kwargs = {
                'batch_size': batch_size,
                'epochs': epochs,
                'verbose': 1
            }  # 'validation_split': 0.1
        elif model_name in ('nn_reg3', 'nn_reg4'):
            framework = 'keras'
            init_kwargs = {
                'in_dim_rna': None,
                'in_dim_dsc': None,
                'dr_rate': dr_rate,
                'opt_name': opt_name,
                'logger': lg.logger
            }
            fit_kwargs = {
                'batch_size': batch_size,
                'epochs': epochs,
                'verbose': 1
            }  # 'validation_split': 0.1
        else:
            # Fail fast: previously an unknown name silently took the
            # always-true branch; without it `framework` would be unbound.
            raise ValueError(f'Unknown model_name: {model_name}')

        # -----------------------------------------------
        #      Learning curve
        # -----------------------------------------------
        lg.logger.info('\n\n{}'.format('=' * 50))
        lg.logger.info(f'Learning curves {src} ...')
        lg.logger.info('=' * 50)

        t0 = time()
        lc = LearningCurve(X=xdata,
                           Y=ydata,
                           cv=None,
                           cv_lists=(tr_id, vl_id),
                           n_shards=n_shards,
                           shard_step_scale='log10',
                           args=args,
                           logger=lg.logger,
                           outdir=run_outdir)

        lrn_crv_scores = lc.trn_learning_curve(
            framework=framework,
            mltype=mltype,
            model_name=model_name,
            init_kwargs=init_kwargs,
            fit_kwargs=fit_kwargs,
            clr_keras_kwargs=clr_keras_kwargs,
            n_jobs=n_jobs,
            random_state=SEED)

        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 360))

        # -------------------------------------------------
        # Learning curve (sklearn method)
        # Problem! cannot log multiple metrics.
        # -------------------------------------------------
        """
        lg.logger.info('\nStart learning curve (sklearn method) ...')
        # Define params
        metric_name = 'neg_mean_absolute_error'
        base = 10
        train_sizes_frac = np.logspace(0.0, 1.0, lc_ticks, endpoint=True, base=base)/base

        # Run learning curve
        t0 = time()
        lrn_curve_scores = learning_curve(
            estimator=model.model, X=xdata, y=ydata,
            train_sizes=train_sizes_frac, cv=cv, groups=groups,
            scoring=metric_name,
            n_jobs=n_jobs, exploit_incremental_learning=False,
            random_state=SEED, verbose=1, shuffle=False)
        lg.logger.info('Runtime: {:.1f} mins'.format( (time()-t0)/60) )

        # Dump results
        # lrn_curve_scores = utils.cv_scores_to_df(lrn_curve_scores, decimals=3, calc_stats=False) # this func won't work
        # lrn_curve_scores.to_csv(os.path.join(run_outdir, 'lrn_curve_scores_auto.csv'), index=False)

        # Plot learning curves
        lrn_crv.plt_learning_curve(rslt=lrn_curve_scores, metric_name=metric_name,
            title='Learning curve (target: {}, data: {})'.format(target_name, tr_sources_name),
            path=os.path.join(run_outdir, 'auto_learning_curve_' + target_name + '_' + metric_name + '.png'))
        """

        lg.kill_logger()
        del xdata, ydata

    print('Done.')
Example #6
0
def run(args):
    """Run the learning-curve training workflow for a single data source.

    Reads the pre-generated feature matrix, metadata, and CV split index
    files from ``args['dirpath']``, builds the model configuration selected
    by ``args['model_name']``, and trains learning curves via
    ``LearningCurve.trn_learning_curve``, dumping logs/args into a per-run
    output directory.

    Args:
        args (dict): parsed command-line arguments; see the keys accessed
            below for the expected contents.

    Raises:
        ValueError: if ``model_name`` does not contain 'reg'/'cls', or is
            not one of the supported model names.
    """
    dirpath = Path(args['dirpath'])
    assert dirpath.exists(), 'You must specify the dirpath.'

    target_name = args['target_name']
    cv_folds = args['cv_folds']
    cv_folds_arr = args['cv_folds_arr']

    # Features
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    # NOTE(review): fea_list is unused below; kept for parity with sibling runners.
    fea_list = cell_fea + drug_fea

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']
    batchnorm = args['batchnorm']

    # Optimizer settings (incl. cyclical-LR kwargs for Keras models)
    opt_name = args['opt']
    lr = args['lr']
    clr_keras_kwargs = {
        'mode': args['clr_mode'],
        'base_lr': args['clr_base_lr'],
        'max_lr': args['clr_max_lr'],
        'gamma': args['clr_gamma']
    }

    # Learning curve shard (training-set size) schedule
    shard_step_scale = args['shard_step_scale']
    min_shard = args['min_shard']
    max_shard = args['max_shard']
    n_shards = args['n_shards']
    shards_arr = args['shards_arr']

    # Other params
    model_name = args['model_name']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls') is inferred from the model name
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # -----------------------------------------------
    #       Load data and pre-proc
    # -----------------------------------------------
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[target_name]]

    # Pre-computed CV split indices (one column per fold)
    tr_id = read_data_file(dirpath / f'{cv_folds}fold_tr_id.csv')
    vl_id = read_data_file(dirpath / f'{cv_folds}fold_vl_id.csv')
    te_id = read_data_file(dirpath / f'{cv_folds}fold_te_id.csv')

    # Data source name is the first '_'-separated token of the dir name
    src = dirpath.name.split('_')[0]

    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath).split('_')[0] + '_trn')
    run_outdir = create_outdir(outdir, args, src)
    lg = Logger(run_outdir / 'logfile.log')
    # NOTE(review): `filepath` is not defined in this function — presumably a
    # module-level constant; confirm it exists at import time.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=run_outdir / 'args.txt')

    # -----------------------------------------------
    #       Data preprocessing
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata,
                      scaler_name=args['scaler'])  # scale features

    # -----------------------------------------------
    #      ML model configs
    # -----------------------------------------------
    if model_name == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {
            'n_estimators': args['gbm_trees'],
            'max_depth': args['gbm_max_depth'],
            'learning_rate': args['gbm_lr'],
            'num_leaves': args['gbm_leaves'],
            'n_jobs': args['n_jobs'],
            'random_state': args['seed']
        }
        fit_kwargs = {'verbose': False}
    elif model_name == 'rf_reg':
        framework = 'sklearn'
        init_kwargs = {'n_jobs': args['n_jobs'], 'random_state': args['seed']}
        fit_kwargs = {}
    # BUG FIX: the original condition was
    #   model_name == 'nn_reg0' or 'nn_reg1' or ...
    # which is always truthy ('nn_reg1' etc. are non-empty string literals),
    # so EVERY model name fell into the Keras branch. Use membership testing.
    elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg_layer_less',
                        'nn_reg_layer_more', 'nn_reg_neuron_less',
                        'nn_reg_neuron_more'):
        framework = 'keras'
        init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'lr': lr,
            'batchnorm': batchnorm,
            'logger': lg.logger
        }
        fit_kwargs = {
            'batch_size': batch_size,
            'epochs': epochs,
            'verbose': 1
        }  # 'validation_split': 0.1
    else:
        # Previously unreachable due to the bug above; unknown models would
        # silently get the Keras config. Now they fail fast with a clear error.
        raise ValueError(f"Unknown model_name: '{model_name}'.")

    # -----------------------------------------------
    #      Learning curve
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('-' * 50))
    lg.logger.info(f'Learning curves {src} ...')
    lg.logger.info('-' * 50)

    # Kwargs for constructing the LearningCurve object (data splits + shard schedule)
    lrn_crv_init_kwargs = {
        'cv': None,
        'cv_lists': (tr_id, vl_id, te_id),
        'cv_folds_arr': cv_folds_arr,
        'shard_step_scale': shard_step_scale,
        'n_shards': n_shards,
        'min_shard': min_shard,
        'max_shard': max_shard,
        'shards_arr': shards_arr,
        'args': args,
        'logger': lg.logger,
        'outdir': run_outdir
    }

    # Kwargs for the actual training loop
    lrn_crv_trn_kwargs = {
        'framework': framework,
        'mltype': mltype,
        'model_name': model_name,
        'init_kwargs': init_kwargs,
        'fit_kwargs': fit_kwargs,
        'clr_keras_kwargs': clr_keras_kwargs,
        'n_jobs': n_jobs,
        'random_state': args['seed']
    }

    t0 = time()
    lc = LearningCurve(X=xdata, Y=ydata, **lrn_crv_init_kwargs)
    lrn_crv_scores = lc.trn_learning_curve(**lrn_crv_trn_kwargs)
    lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

    # -------------------------------------------------
    # Learning curve (sklearn method)
    # Problem! cannot log multiple metrics.
    # -------------------------------------------------
    """
    lg.logger.info('\nStart learning curve (sklearn method) ...')
    # Define params
    metric_name = 'neg_mean_absolute_error'
    base = 10
    train_sizes_frac = np.logspace(0.0, 1.0, lc_ticks, endpoint=True, base=base)/base

    # Run learning curve
    t0 = time()
    lrn_curve_scores = learning_curve(
        estimator=model.model, X=xdata, y=ydata,
        train_sizes=train_sizes_frac, cv=cv, groups=groups,
        scoring=metric_name,
        n_jobs=n_jobs, exploit_incremental_learning=False,
        random_state=SEED, verbose=1, shuffle=False)
    lg.logger.info('Runtime: {:.1f} mins'.format( (time()-t0)/60) )

    # Dump results
    # lrn_curve_scores = utils.cv_scores_to_df(lrn_curve_scores, decimals=3, calc_stats=False) # this func won't work
    # lrn_curve_scores.to_csv(os.path.join(run_outdir, 'lrn_curve_scores_auto.csv'), index=False)

    # Plot learning curves
    lrn_crv.plt_learning_curve(rslt=lrn_curve_scores, metric_name=metric_name,
        title='Learning curve (target: {}, data: {})'.format(target_name, tr_sources_name),
        path=os.path.join(run_outdir, 'auto_learning_curve_' + target_name + '_' + metric_name + '.png'))
    """

    lg.kill_logger()
    del xdata, ydata

    print('Done.')
Example #7
0
def make_split(xdata, meta, outdir, args):
    """Generate train/test and CV-fold index files for a dataset.

    Performs an (optional) hold-out train/test split followed by several
    CV splits of the remaining training indices, dumping the resulting
    index vectors as CSV files into ``outdir``.

    Args:
        xdata: feature dataframe (may be reloaded from ``outdir`` — see below).
        meta: metadata dataframe aligned row-wise with ``xdata``.
        outdir: output directory (pathlib.Path) for logs and index CSVs.
        args (dict): split configuration ('te_method', 'cv_method',
            'te_size', 'vl_size', 'cell_fea', 'drug_fea', 'n_jobs').
    """
    # Data splits
    te_method = args['te_method']
    cv_method = args['cv_method']
    te_size = split_size(args['te_size'])
    vl_size = split_size(args['vl_size'])

    # Features
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    # fea_list = cell_fea + drug_fea

    # Other params
    n_jobs = args['n_jobs']

    # Hard split
    # NOTE(review): grp_by_col is hard-coded to None, so any 'group' te_method
    # or cv_method below would evaluate meta[None] and fail — confirm whether
    # group splitting is expected to be reachable here.
    grp_by_col = None
    # cv_method = 'simple'

    # TODO: this need to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #       Outdir and Logger
    # -----------------------------------------------
    # Logger
    lg = Logger(outdir / 'splitter.log')
    # NOTE(review): `filepath` is not defined here — presumably module-level.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=outdir / 'args.txt')

    # -----------------------------------------------
    #       Load data and pre-proc
    # -----------------------------------------------
    # If pre-extracted parquet files exist in outdir, prefer them over the
    # passed-in dataframes (silently shadows the xdata/meta arguments).
    if (outdir / 'xdata.parquet').is_file():
        xdata = pd.read_parquet(outdir / 'xdata.parquet')
        meta = pd.read_parquet(outdir / 'meta.parquet')

    # -----------------------------------------------
    #       Train-test split
    # -----------------------------------------------
    # Fixed-seed permutation of row indices; all splits below operate on
    # positions in this vector and are mapped back to original row indices.
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    if te_method is not None:
        lg.logger.info('\nSplit train/test.')
        te_splitter = cv_splitter(cv_method=te_method,
                                  cv_folds=1,
                                  test_size=te_size,
                                  mltype=mltype,
                                  shuffle=False,
                                  random_state=SEED)

        te_grp = meta[grp_by_col].values[
            idx_vec] if te_method == 'group' else None
        # Group labels must be numeric for sklearn splitters.
        if is_string_dtype(te_grp):
            te_grp = LabelEncoder().fit_transform(te_grp)

        # Split train/test
        tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
        tr_id = idx_vec[tr_id]  # adjust the indices!
        te_id = idx_vec[te_id]  # adjust the indices!

        # header=[0] writes a literal '0' column header into each CSV.
        pd.Series(tr_id).to_csv(outdir / f'tr_id.csv', index=False, header=[0])
        pd.Series(te_id).to_csv(outdir / f'te_id.csv', index=False, header=[0])

        lg.logger.info('Train: {:.1f}'.format(len(tr_id) / xdata.shape[0]))
        lg.logger.info('Test:  {:.1f}'.format(len(te_id) / xdata.shape[0]))

        # Update the master idx vector for the CV splits
        idx_vec = tr_id

        # Plot dist of responses (TODO: this can be done to all response metrics)
        # plot_ytr_yvl_dist(ytr=tr_ydata.values, yvl=te_ydata.values,
        #         title='tr and te', outpath=run_outdir/'tr_te_resp_dist.png')

        # Confirm that group splits are correct
        if te_method == 'group' and grp_by_col is not None:
            tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
            te_grp_unq = set(meta.loc[te_id, grp_by_col])
            lg.logger.info(
                f'\tTotal group ({grp_by_col}) intersections btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.'
            )
            lg.logger.info(
                f'\tA few intersections : {list(tr_grp_unq.intersection(te_grp_unq))[:3]}.'
            )

        # Update vl_size to effective vl_size
        # Rescale so the validation fraction refers to the full dataset size,
        # not just the remaining training subset.
        vl_size = vl_size * xdata.shape[0] / len(tr_id)

        del tr_id, te_id

    # -----------------------------------------------
    #       Generate CV splits
    # -----------------------------------------------
    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        cv = cv_splitter(cv_method=cv_method,
                         cv_folds=cv_folds,
                         test_size=vl_size,
                         mltype=mltype,
                         shuffle=False,
                         random_state=SEED)

        cv_grp = meta[grp_by_col].values[
            idx_vec] if cv_method == 'group' else None
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        # fold index -> list of original row indices
        tr_folds = {}
        vl_folds = {}

        # Start CV iters
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec,
                                                       groups=cv_grp)):
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            tr_folds[fold] = tr_id.tolist()
            vl_folds[fold] = vl_id.tolist()

            # Confirm that group splits are correct
            if cv_method == 'group' and grp_by_col is not None:
                tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
                vl_grp_unq = set(meta.loc[vl_id, grp_by_col])
                lg.logger.info(
                    f'\tTotal group ({grp_by_col}) intersections btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.'
                )
                lg.logger.info(
                    f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(
                    f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')

        # Convert to df
        # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        # Columns may have unequal lengths; pd.Series pads with NaN.
        tr_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in tr_folds.items()]))
        vl_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in vl_folds.items()]))

        # Dump
        tr_folds.to_csv(outdir / f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir / f'{cv_folds}fold_vl_id.csv', index=False)

    lg.kill_logger()
Example #8
0
def build_dataframe(args):
    """Build the master response/feature dataframe for the top-N cancer types.

    Pipeline: identify the top-N cancer types by response count, filter the
    drug-response table to the associated cell lines and drugs, clean it,
    merge in RNA-Seq (cell) features and drug descriptors, save the result
    in the requested format, and plot a per-cancer-type response histogram.

    Args:
        args: parsed argparse namespace (top_n, src, target, response_type,
            labels, format, seed, ... — see accesses below).
    """
    na_values = ['na', '-', '']  # (ap)

    # (ap) Create outdir and logger
    import os
    # outdir = Path('top' + str(args.top_n) + sffx + '_data')
    sffx = '' if args.src is None else '_'.join(args.src)
    # Only use the 'topN' prefix for genuinely restrictive top_n values.
    if args.top_n < 200:
        outdir = Path('top' + str(args.top_n) + sffx)
    else:
        outdir = Path(sffx)
    os.makedirs(outdir, exist_ok=True)
    lg = Logger(outdir / 'logfile.log')
    # NOTE(review): `filepath`, `response_path`, `cell_cancer_types_map_path`,
    # and `drug_list_path` are not defined here — presumably module-level.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(vars(args), outpath=outdir / 'args.txt')

    # Identify Top N cancer types
    df_response = pd.read_csv(response_path,
                              sep='\t',
                              engine='c',
                              low_memory=False,
                              na_values=na_values,
                              warn_bad_lines=True)
    lg.logger.info(
        df_response.groupby('SOURCE').agg({
            'CELL': 'nunique',
            'DRUG': 'nunique'
        }).reset_index())  # (ap)

    # (ap) Extract specific data sources
    df_response['SOURCE'] = df_response['SOURCE'].apply(
        lambda x: x.lower())  # (ap)
    if args.src is not None:
        df_response = df_response[df_response['SOURCE'].isin(
            args.src)].reset_index(drop=True)

    # Unique (cell, drug) pairs present in the response data
    df_uniq_cl_drugs = df_response[['CELL', 'DRUG'
                                    ]].drop_duplicates().reset_index(drop=True)

    df_cl_cancer_map = pd.read_csv(cell_cancer_types_map_path,
                                   sep='\t',
                                   header=None,
                                   names=['CELL', 'CANCER_TYPE'])
    # NOTE(review): set_index is NOT in-place and the result is discarded —
    # this line has no effect. Confirm whether inplace=True was intended.
    df_cl_cancer_map.set_index('CELL')

    # NOTE(review): sort='true' is a string, which pandas treats as truthy.
    df_cl_cancer_drug = df_cl_cancer_map.merge(df_uniq_cl_drugs,
                                               on='CELL',
                                               how='left',
                                               sort='true')
    # Composite key used only for counting responses per cancer type.
    df_cl_cancer_drug['CELL_DRUG'] = df_cl_cancer_drug.CELL.astype(
        str) + '.' + df_cl_cancer_drug.DRUG.astype(str)

    # Rank cancer types by number of (cell, drug) responses; keep the top N.
    top_n = df_cl_cancer_drug.groupby(['CANCER_TYPE']).count().sort_values(
        'CELL_DRUG', ascending=False).head(args.top_n)
    top_n_cancer_types = top_n.index.to_list()

    lg.logger.info("Identified {} cancer types: {}".format(
        args.top_n, top_n_cancer_types))

    # Identify cell lines associated with the target cancer types
    df_cl = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(
        top_n_cancer_types)][['CELL']].drop_duplicates().reset_index(drop=True)

    # Identify drugs associated with the target cancer type & filtered by drug_list
    df_drugs = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(
        top_n_cancer_types)][['DRUG']].drop_duplicates().reset_index(drop=True)

    drug_list = pd.read_csv(drug_list_path)['DRUG'].to_list()
    df_drugs = df_drugs[df_drugs['DRUG'].isin(drug_list)].reset_index(
        drop=True)

    # Filter response by cell lines (4882) and drugs (1779)
    cl_filter = df_cl.CELL.to_list()
    dr_filter = df_drugs.DRUG.to_list()
    target = args.target

    # df_response = df_response[df_response.CELL.isin(cl_filter) & df_response.DRUG.isin(dr_filter)][['CELL', 'DRUG', target]].drop_duplicates().reset_index(drop=True) # (ap) commented
    idx = df_response.CELL.isin(cl_filter) & df_response.DRUG.isin(dr_filter)
    df_response = df_response[idx].drop_duplicates().reset_index(
        drop=True)  # (ap) keep all targets

    # (ap) Drop bad points (these identified by Yitan)
    # TODO: confirm this with Yitan!
    """
    lg.logger.info('\nDrop bad samples ...')
    id_drop = (df_response['AUC'] == 0) & (df_response['EC50se'] == 0) & (df_response['R2fit'] == 0)
    df_response = df_response.loc[~id_drop,:]
    lg.logger.info(f'Dropped {sum(id_drop)} rsp data points.')
    lg.logger.info(f'df_response.shape {df_response.shape}')
    """

    # (ap) Drop points with bad fit
    # TODO: check this (may require a more rigorous analysis)
    lg.logger.info('\nDrop samples with bad fit (R2fit) ...')
    lg.logger.info(f'df_response.shape {df_response.shape}')
    id_drop = df_response['R2fit'] <= 0
    df_response = df_response.loc[~id_drop, :]
    lg.logger.info(f'Dropped {sum(id_drop)} rsp data points.')
    lg.logger.info(f'df_response.shape {df_response.shape}')

    # Binarize the target at 0.5 when a classification response is requested.
    if args.response_type == 'bin':
        df_response[target] = df_response[target].apply(lambda x: 0
                                                        if x < 0.5 else 1)
        df_response.rename(columns={target: 'Response'}, inplace=True)

    # ----------------
    # Load RNA-Seq
    # ----------------
    # Join response data with Drug descriptor & RNASeq
    df_rnaseq = pd.read_csv(get_cell_feature_path(args),
                            sep='\t',
                            low_memory=False,
                            na_values=na_values,
                            warn_bad_lines=True)
    df_rnaseq = df_rnaseq[df_rnaseq['Sample'].isin(cl_filter)].reset_index(
        drop=True)

    df_rnaseq.rename(columns={'Sample': 'CELL'}, inplace=True)
    # Prefix all feature columns (everything after the CELL key) with 'GE_'.
    df_rnaseq.columns = [
        'GE_' + x if i > 0 else x
        for i, x in enumerate(df_rnaseq.columns.to_list())
    ]
    df_rnaseq = df_rnaseq.set_index(['CELL'])

    # ----------------
    # Load descriptors
    # ----------------
    df_descriptor = pd.read_csv(get_drug_descriptor_path(args),
                                sep='\t',
                                low_memory=False,
                                na_values=na_values,
                                warn_bad_lines=True)
    # df_descriptor = df_descriptor[df_descriptor.DRUG.isin(dr_filter)].set_index(['DRUG']).fillna(0) # (ap) commented --> bad imputation!
    df_descriptor = df_descriptor[df_descriptor.DRUG.isin(
        dr_filter)].set_index(['DRUG'])  # (ap) added --> drop data imputation!

    # (ap) Some features have too many NA values (drop these)
    lg.logger.info('\nDrop cols with too many NA values ...')
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')
    df_descriptor = dropna(df=df_descriptor, axis=1, th=0.5)
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')

    # (ap) Impute missing values
    # There are descriptors for which there is a single unique value excluding NA (drop these)
    lg.logger.info(
        '\nDrop cols that have a single unique value (excluding NAs) ...')
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')
    col_idx = df_descriptor.nunique(dropna=True).values == 1
    df_descriptor = df_descriptor.iloc[:, ~col_idx]
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')

    # (ap) Impute missing values (drug descriptors)
    lg.logger.info('\nImpute NA values ...')
    df_descriptor = impute_values(df_descriptor, logger=None)

    # (ap)
    # There are still lots of descriptors which have only a few unique values.
    # We can categorize those values. e.g.: 564 descriptors have only 2 unique vals,
    # and 154 descriptors have only 3 unique vals, etc.
    # todo: use utility code from p1h_alex/utils/data_preproc.py that transform those
    # features into categorical and also applies an appropriate imputation.
    # df_descriptor.nunique(dropna=True).value_counts()[:10]
    # df_descriptor.nunique(dropna=True).value_counts().sort_index()[:10]

    df = df_response.merge(df_rnaseq, on='CELL', how='left', sort='true')
    df.set_index(
        ['DRUG'])  # TODO: this doesn't take effect unless performed 'inplace'

    df_final = df.merge(df_descriptor, on='DRUG', how='left', sort='true')
    if args.labels:
        # Preserve CELL/DRUG labels across drop_duplicates: remember them by
        # row index, dedupe without them, then re-insert from the maps.
        df_cell_map = df_final['CELL'].to_dict()
        df_drug_map = df_final['DRUG'].to_dict()
        df_final.drop(columns=['CELL', 'DRUG'], inplace=True)
        df_final.drop_duplicates(inplace=True)
        df_final.insert(0, 'DRUG', df_final.index.map(df_drug_map))
        df_final.insert(0, 'CELL', df_final.index.map(df_cell_map))
        df_final.reset_index(drop=True, inplace=True)
    else:
        df_final.drop(columns=['CELL', 'DRUG'], inplace=True)
        df_final.drop_duplicates(inplace=True)
    lg.logger.info("\nDataframe is built with total {} rows.".format(
        len(df_final)))

    # (ap) Shuffle
    # lg.logger.info("Shuffle final df.")
    # df_final = df_final.sample(frac=1.0, random_state=args.seed).reset_index(drop=True)
    lg.logger.info(
        df_final.groupby('SOURCE').agg({
            'CELL': 'nunique',
            'DRUG': 'nunique'
        }).reset_index())  # (ap)

    save_filename = build_filename(args)
    # print("Saving to {}".format(save_filename)) # (ap) remove
    save_filename = outdir / save_filename  # (ap) added

    # Persist in the requested format (no-op for unrecognized formats).
    if args.format == 'feather':
        df_final.to_feather(save_filename)
    elif args.format == 'csv':
        df_final.to_csv(str(save_filename) + '.csv',
                        float_format='%g',
                        index=False)
    elif args.format == 'tsv':
        df_final.to_csv(save_filename,
                        sep='\t',
                        float_format='%g',
                        index=False)
    elif args.format == 'parquet':
        df_final.to_parquet(str(save_filename) + '.parquet', index=False)
    elif args.format == 'hdf5':
        # HDF5 additionally dumps the cell-line and drug lists as side files.
        df_cl.to_csv(build_file_basename(args) + '_cellline.txt',
                     header=False,
                     index=False)
        df_drugs.to_csv(build_file_basename(args) + '_drug.txt',
                        header=False,
                        index=False)
        df_final.to_hdf(save_filename,
                        key='df',
                        mode='w',
                        complib='blosc:snappy',
                        complevel=9)

    # --------------------------------------------------
    # (ap) tissue type histogram
    # --------------------------------------------------
    def plot_tissue_hist(top_n):
        """Plot and save a horizontal histogram of AUC responses per cancer type."""
        dd = df_cl_cancer_drug[['CELL', 'DRUG', 'CANCER_TYPE'
                                ]].merge(df_final[['CELL', 'DRUG', 'AUC']],
                                         on=['CELL', 'DRUG'],
                                         how='inner')
        dd = pd.DataFrame(dd['CANCER_TYPE'].value_counts())
        dd = dd.reset_index().rename(columns={
            'index': 'ctype',
            'CANCER_TYPE': 'count'
        })
        # Replace underscores with spaces for readable axis labels.
        dd['ctype'] = dd['ctype'].map(lambda x: ' '.join(x.split('_')))

        x = dd['ctype']
        y = dd['count']
        ax = dd.plot.barh(x='ctype',
                          y='count',
                          xlim=[0, y.max() * 1.15],
                          legend=False,
                          figsize=(9, 7),
                          fontsize=12)
        ax.set_ylabel(None, fontsize=14)
        ax.set_xlabel('Total responses', fontsize=14)
        ax.set_title(
            'Number of AUC responses per cancer type ({})'.format(top_n),
            fontsize=14)
        ax.invert_yaxis()

        # Annotate each bar with its count in thousands ('k').
        for p in ax.patches:
            val = int(p.get_width() / 1000)
            x = p.get_x() + p.get_width() + 1000
            y = p.get_y() + p.get_height() / 2
            ax.annotate(str(val) + 'k', (x, y), fontsize=10)

        # OR
        # fig, ax = plt.subplots(figsize=(7, 5))
        # plt.barh(dd['CANCER_TYPE'], dd['CELL_DRUG'], color='b', align='center', alpha=0.7)
        # plt.xlabel('Total responses', fontsize=14);
        plt.savefig(outdir / 'Top{}_histogram.png'.format(top_n),
                    dpi=100,
                    bbox_inches='tight')

        return dd

    dd = plot_tissue_hist(top_n=args.top_n)
    # --------------------------------------------------

    # --------------------------------------------------
    # (ap) break data
    # --------------------------------------------------
    # Split features and target
    # print('\nSplit features and target.')

    # meta = df_final[['AUC', 'CELL', 'DRUG']]
    # xdata = df_final.drop(columns=['AUC', 'CELL', 'DRUG'])

    # xdata.to_parquet( outdir/'xdata.parquet' )
    # meta.to_parquet( outdir/'meta.parquet' )

    # print('Totoal DD: {}'.format( len([c for c in xdata.columns if 'DD' in c]) ))
    # print('Totoal GE: {}'.format( len([c for c in xdata.columns if 'GE' in c]) ))
    # --------------------------------------------------

    # --------------------------------------------------
    # (ap) generate train/val/test splits
    # --------------------------------------------------
    # from data_split import make_split
    # print('\nSplit train/val/test.')
    # args['cell_fea'] = 'GE'
    # args['drug_fea'] = 'DD'
    # args['te_method'] = 'simple'
    # args['cv_method'] = 'simple'
    # args['te_size'] = 0.1
    # args['vl_size'] = 0.1
    # args['n_jobs'] = 4
    # make_split(xdata=xdata, meta=meta, outdir=outdir, args=args)
    # --------------------------------------------------
    lg.kill_logger()
    print('Done.')
Example #9
0
def run(args):
    """Split a master dataset into hold-out and k-fold CV index files.

    Loads the single parquet file found under ``args['dirpath']``, separates
    feature columns from metadata, then generates (1) a single hold-out
    train/val/test split and (2) k-fold CV splits for several values of k
    (each fold's train portion further split into train/test), dumping all
    index vectors as CSV files into a newly created output directory.

    Args:
        args (dict): parsed command-line arguments. Keys read here include
            'dirpath', 'te_size', 'cell_fea', 'drug_fea', 'split_on',
            and 'seed'. ``args['outdir']`` is written back into the dict.
    """
    dirpath = verify_dirpath(args['dirpath'])
    te_size = split_size(args['te_size'])
    # Feature-name prefixes (cell + drug) used to select feature columns.
    fea_list = args['cell_fea'] + args['drug_fea']

    # Hard split
    # If split_on is given, splits are grouped on that meta column so the
    # same group (e.g. cell line) never appears on both sides of a split.
    split_on = None if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method

    # TODO: this needs to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #       Create (outdir and) logger
    # -----------------------------------------------
    outdir = create_outdir(dirpath, args)
    args['outdir'] = str(outdir)
    lg = Logger(outdir / 'data_splitter_logfile.log')
    # NOTE(review): `filepath` is not defined in this function -- presumably
    # a module-level global holding this script's path; confirm it exists.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(args, outpath=outdir / 'data_splitter_args.txt')  # dump args.

    # -----------------------------------------------
    #       Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    # files = list(dirpath.glob('**/*.parquet'))
    files = list(dirpath.glob('./*.parquet'))
    # NOTE(review): if no parquet file is found, `data` is unbound and the
    # next line raises NameError.
    if len(files) > 0:
        data = pd.read_parquet(
            files[0])  # TODO: assumes that there is only one data file
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and traget, and dump to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    # Everything that is not a feature column is metadata.
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet(outdir / 'xdata.parquet')
    meta.to_parquet(outdir / 'meta.parquet')

    lg.logger.info('Total DD: {}'.format(
        len([c for c in xdata.columns if 'DD_' in c])))
    lg.logger.info('Total GE: {}'.format(
        len([c for c in xdata.columns if 'GE_' in c])))
    lg.logger.info('Unique cells: {}'.format(meta['CELL'].nunique()))
    lg.logger.info('Unique drugs: {}'.format(meta['DRUG'].nunique()))
    # cnt_fea(df, fea_sep='_', verbose=True, logger=lg.logger)

    plot_hist(meta['AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_all.png')

    # -----------------------------------------------
    #       Generate Hold-Out split (train/val/test)
    # -----------------------------------------------
    """ First, we split the data into train and test. The remaining of train set is further
    splitted into train and validation.
    """
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info('Split into hold-out train/val/test')
    lg.logger.info('{}'.format('-' * 50))

    # Note that we don't shuffle the original dataset, but rather we create a vector array of
    # representative indices.
    np.random.seed(args['seed'])
    idx_vec = np.random.permutation(data.shape[0])

    # Create splitter that splits the full dataset into tr and te
    # te_size is expressed as a fold count (e.g. 0.1 -> 10 folds); only the
    # first fold of the splitter is consumed below.
    te_folds = int(1 / te_size)
    te_splitter = cv_splitter(cv_method=te_method,
                              cv_folds=te_folds,
                              test_size=None,
                              mltype=mltype,
                              shuffle=False,
                              random_state=args['seed'])

    te_grp = None if split_on is None else meta[split_on].values[idx_vec]
    # Group labels must be numeric for the splitter.
    if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

    # Split tr into tr and te
    tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    tr_id = idx_vec[
        tr_id]  # adjust the indices! we'll split the remaining tr into te and vl
    te_id = idx_vec[te_id]  # adjust the indices!

    # Update a vector array that excludes the test indices
    idx_vec_ = tr_id
    del tr_id

    # Define vl_size while considering the new full size of the available samples
    # (rescaled so the val set has the same absolute size as the test set).
    vl_size = te_size / (1 - te_size)
    cv_folds = int(1 / vl_size)

    # Create splitter that splits tr into tr and vl
    cv = cv_splitter(cv_method=cv_method,
                     cv_folds=cv_folds,
                     test_size=None,
                     mltype=mltype,
                     shuffle=False,
                     random_state=args['seed'])

    cv_grp = None if split_on is None else meta[split_on].values[idx_vec_]
    if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)

    # Split tr into tr and vl
    tr_id, vl_id = next(cv.split(idx_vec_, groups=cv_grp))
    tr_id = idx_vec_[tr_id]  # adjust the indices!
    vl_id = idx_vec_[vl_id]  # adjust the indices!

    # Dump tr, vl, te indices
    np.savetxt(outdir / '1fold_tr_id.csv',
               tr_id.reshape(-1, 1),
               fmt='%d',
               delimiter='',
               newline='\n')
    np.savetxt(outdir / '1fold_vl_id.csv',
               vl_id.reshape(-1, 1),
               fmt='%d',
               delimiter='',
               newline='\n')
    np.savetxt(outdir / '1fold_te_id.csv',
               te_id.reshape(-1, 1),
               fmt='%d',
               delimiter='',
               newline='\n')

    lg.logger.info('Train samples {} ({:.2f}%)'.format(
        len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
    lg.logger.info('Val   samples {} ({:.2f}%)'.format(
        len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
    lg.logger.info('Test  samples {} ({:.2f}%)'.format(
        len(te_id), 100 * len(te_id) / xdata.shape[0]))

    # Confirm that group splits are correct (no intersection)
    grp_col = 'CELL' if split_on is None else split_on
    print_intersection_on_var(meta,
                              tr_id=tr_id,
                              vl_id=vl_id,
                              te_id=te_id,
                              grp_col=grp_col,
                              logger=lg.logger)

    plot_hist(meta.loc[tr_id, 'AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_train.png')
    plot_hist(meta.loc[vl_id, 'AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_val.png')
    plot_hist(meta.loc[te_id, 'AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_test.png')

    plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'],
                      yvl=meta.loc[vl_id, 'AUC'],
                      title='ytr_yvl_dist',
                      outpath=outdir / 'ytr_yvl_dist.png')

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    """ K-fold CV split is applied with multiple values of k. For each set of splits k, the dataset is divided
    into k splits, where each split results in train and val samples. In this process, we take the train samples,
    and divide them into a smaller subset of train samples and test samples.
    """
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info(f"Split into multiple sets k-fold splits (multiple k's)")
    lg.logger.info('{}'.format('-' * 50))
    cv_folds_list = [5, 7, 10, 15, 20]

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\n----- {cv_folds}-fold splits -----')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method,
                         cv_folds=cv_folds,
                         test_size=None,
                         mltype=mltype,
                         shuffle=False,
                         random_state=args['seed'])

        cv_grp = None if split_on is None else meta[split_on].values[idx_vec]
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        # NOTE: te_folds here shadows the earlier int of the same name; from
        # this point on it is a dict of per-fold test indices.
        tr_folds, vl_folds, te_folds = {}, {}, {}

        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec,
                                                       groups=cv_grp)):
            lg.logger.info(f'\nFold {fold+1}')
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            # -----------------
            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Update te_size to the new full size of available samples
            # (so the test set matches the val set's absolute size).
            te_size_ = len(vl_id) / len(idx_vec) / (1 -
                                                    len(vl_id) / len(idx_vec))
            te_folds_split = int(1 / te_size_)

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method,
                                      cv_folds=te_folds_split,
                                      test_size=None,
                                      mltype=mltype,
                                      shuffle=False,
                                      random_state=args['seed'])

            # Update the index array
            idx_vec_ = tr_id
            del tr_id

            te_grp = None if split_on is None else meta[split_on].values[
                idx_vec_]
            if is_string_dtype(te_grp):
                te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id]  # adjust the indices!
            te_id = idx_vec_[te_id]  # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format(
                len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format(
                len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format(
                len(te_id), 100 * len(te_id) / xdata.shape[0]))

            # Confirm that group splits are correct (no intersection)
            grp_col = 'CELL' if split_on is None else split_on
            print_intersection_on_var(meta,
                                      tr_id=tr_id,
                                      vl_id=vl_id,
                                      te_id=te_id,
                                      grp_col=grp_col,
                                      logger=lg.logger)

        # Convet to df
        # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in tr_folds.items()]))
        vl_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in vl_folds.items()]))
        te_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in te_folds.items()]))

        # Dump
        tr_folds.to_csv(outdir / f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir / f'{cv_folds}fold_vl_id.csv', index=False)
        te_folds.to_csv(outdir / f'{cv_folds}fold_te_id.csv', index=False)

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    # The triple-quoted block below is dead code retained as reference; it is
    # a string-literal statement and never executes.
    """
    # TODO: consider to separate the pipeline hold-out and k-fold splits!
    # Since we shuffled the dataset, we don't need to shuffle again.
    # np.random.seed(args['seed'])
    # idx_vec = np.random.permutation(xdata.shape[0])
    idx_vec = np.array(range(xdata.shape[0]))

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=args['seed'])

        cv_grp = None if split_on is None else meta[split_on].values[idx_vec]
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds, vl_folds, te_folds = {}, {}, {}

        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold}')
            tr_id = idx_vec[tr_id] # adjust the indices!
            vl_id = idx_vec[vl_id] # adjust the indices!

            # -----------------
            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Update te_size to the new full size of available samples
            if cv_folds == 1:
                te_size_ = vl_size / (1 - vl_size)
            else:
                te_size_ = len(vl_id)/len(idx_vec) / (1 - len(vl_id)/len(idx_vec))

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size_,
                                      mltype=mltype, shuffle=False, random_state=args['seed'])

            # Update the index array
            idx_vec_ = tr_id; del tr_id

            te_grp = None if split_on is None else meta[split_on].values[idx_vec_]
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id] # adjust the indices!
            te_id = idx_vec_[te_id] # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format( len(tr_id), 100*len(tr_id)/xdata.shape[0] ))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format( len(vl_id), 100*len(vl_id)/xdata.shape[0] ))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format( len(te_id), 100*len(te_id)/xdata.shape[0] ))

            # Confirm that group splits are correct
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convet to df
        # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T 
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
        vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
        te_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in te_folds.items() ]))

        # Dump
        tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
        vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
        te_folds.to_csv( outdir/f'{cv_folds}fold_te_id.csv', index=False )

        # Plot target dist only for the 1-fold case
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds==1 and fold==0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')

            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')

            # pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
            # pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
            # pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')            
    """

    #     # -----------------------------------------------
    #     #       Train-test split
    #     # -----------------------------------------------
    #     np.random.seed(SEED)
    #     idx_vec = np.random.permutation(xdata.shape[0])
    #
    #     if te_method is not None:
    #         lg.logger.info('\nSplit train/test.')
    #         te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size,
    #                                   mltype=mltype, shuffle=False, random_state=SEED)
    #
    #         te_grp = meta[grp_by_col].values[idx_vec] if te_method=='group' else None
    #         if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)
    #
    #         # Split train/test
    #         tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    #         tr_id = idx_vec[tr_id] # adjust the indices!
    #         te_id = idx_vec[te_id] # adjust the indices!
    #
    #         pd.Series(tr_id).to_csv( outdir/f'tr_id.csv', index=False, header=[0] )
    #         pd.Series(te_id).to_csv( outdir/f'te_id.csv', index=False, header=[0] )
    #
    #         lg.logger.info('Train: {:.1f}'.format( len(tr_id)/xdata.shape[0] ))
    #         lg.logger.info('Test:  {:.1f}'.format( len(te_id)/xdata.shape[0] ))
    #
    #         # Update the master idx vector for the CV splits
    #         idx_vec = tr_id
    #
    #         # Plot dist of responses (TODO: this can be done to all response metrics)
    #         # plot_ytr_yvl_dist(ytr=tr_ydata.values, yvl=te_ydata.values,
    #         #         title='tr and te', outpath=run_outdir/'tr_te_resp_dist.png')
    #
    #         # Confirm that group splits are correct
    #         if te_method=='group' and grp_by_col is not None:
    #             tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
    #             te_grp_unq = set(meta.loc[te_id, grp_by_col])
    #             lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
    #             lg.logger.info(f'\tA few intersections : {list(tr_grp_unq.intersection(te_grp_unq))[:3]}.')
    #
    #         # Update vl_size to effective vl_size
    #         vl_size = vl_size * xdata.shape[0]/len(tr_id)
    #
    #         # Plot hist te
    #         pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')
    #         plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
    #
    #         del tr_id, te_id
    #
    #
    #     # -----------------------------------------------
    #     #       Generate CV splits
    #     # -----------------------------------------------
    #     cv_folds_list = [1, 5, 7, 10, 15, 20, 25]
    #     lg.logger.info(f'\nStart CV splits ...')
    #
    #     for cv_folds in cv_folds_list:
    #         lg.logger.info(f'\nCV folds: {cv_folds}')
    #
    #         cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
    #                          mltype=mltype, shuffle=False, random_state=SEED)
    #
    #         cv_grp = meta[grp_by_col].values[idx_vec] if cv_method=='group' else None
    #         if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
    #
    #         tr_folds = {}
    #         vl_folds = {}
    #
    #         # Start CV iters
    #         for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
    #             tr_id = idx_vec[tr_id] # adjust the indices!
    #             vl_id = idx_vec[vl_id] # adjust the indices!
    #
    #             tr_folds[fold] = tr_id.tolist()
    #             vl_folds[fold] = vl_id.tolist()
    #
    #             # Confirm that group splits are correct
    #             if cv_method=='group' and grp_by_col is not None:
    #                 tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
    #                 vl_grp_unq = set(meta.loc[vl_id, grp_by_col])
    #                 lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
    #                 lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
    #                 lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
    #
    #         # Convet to df
    #         # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
    #         # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
    #         # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
    #         tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
    #         vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
    #
    #         # Dump
    #         tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
    #         vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
    #
    #         # Plot target dist only for the 1-fold case
    #         if cv_folds==1 and fold==0:
    #             plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
    #             plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
    #
    #             plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
    #                               title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
    #
    #             pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
    #             pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')

    lg.kill_logger()
    print('Done.')
Example #10
0
def run(args):
    """Generate learning curves for one preprocessed data source.

    Reads the feature matrix, metadata, and precomputed CV split indices
    from ``args['dirpath']``, configures an ML model (lightgbm, sklearn,
    or keras) from ``args['model_name']``, and trains learning curves via
    ``LearningCurve``, writing logs and results into a new output directory.

    Args:
        args (dict): parsed command-line arguments. Keys read include
            'dirpath', 'model_name', 'target_name', 'cv_folds', 'scaler',
            'seed', 'n_jobs', the clr_* keys, shard-sizing keys, and the
            per-framework hyperparameter keys. ``args['outdir']`` and
            ``args['split_on']`` are written back into the dict.

    Raises:
        ValueError: if 'model_name' contains neither 'reg' nor 'cls', or
            is not one of the recognized model names.
    """
    dirpath = verify_dirpath(args['dirpath'])

    # Cyclical learning-rate settings (consumed only by keras models).
    clr_keras_kwargs = {'mode': args['clr_mode'], 'base_lr': args['clr_base_lr'],
                        'max_lr': args['clr_max_lr'], 'gamma': args['clr_gamma']}

    # ML type ('reg' or 'cls') is inferred from the model name
    if 'reg' in args['model_name']:
        mltype = 'reg'
    elif 'cls' in args['model_name']:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Find out which metadata field was used for hard split (cell, drug, or none).
    # The data splitter dumped its args into a '*args.txt' file; parse it back.
    # (Don't shadow the file handle with the path variable, as before.)
    args_file = [f for f in dirpath.glob('*args.txt')][0]
    with open(args_file) as fh:
        lines = fh.readlines()
    split_on = [l.split(':')[-1].strip() for l in lines if 'split_on' in l][0]
    args['split_on'] = split_on.lower()

    # -----------------------------------------------
    #       Load data and pre-proc
    # -----------------------------------------------
    xdata = read_data_file( dirpath/'xdata.parquet', 'parquet' )
    meta  = read_data_file( dirpath/'meta.parquet', 'parquet' )
    ydata = meta[[ args['target_name'] ]]

    # Precomputed split indices for the requested number of CV folds.
    tr_id = read_data_file( dirpath/'{}fold_tr_id.csv'.format(args['cv_folds']) )
    vl_id = read_data_file( dirpath/'{}fold_vl_id.csv'.format(args['cv_folds']) )
    te_id = read_data_file( dirpath/'{}fold_te_id.csv'.format(args['cv_folds']) )

    # Data source name, e.g. 'gdsc' out of a parent dir like 'gdsc.xxx'.
    src = str(dirpath.parent).split('/')[-1].split('.')[0]

    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    # NOTE(review): OUTDIR and filepath appear to be module-level globals
    # (output root and this script's path) -- confirm they are defined.
    outdir = create_outdir(OUTDIR, args, src)
    args['outdir'] = str(outdir)
    lg = Logger(outdir/'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(args, outpath=outdir/'args.txt') # dump args

    # -----------------------------------------------
    #       Data preprocessing
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])  # scale features

    # -----------------------------------------------
    #      ML model configs
    # -----------------------------------------------
    if args['model_name'] == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {'n_estimators': args['gbm_trees'], 'max_depth': args['gbm_max_depth'],
                       'learning_rate': args['gbm_lr'], 'num_leaves': args['gbm_leaves'],
                       'n_jobs': args['n_jobs'], 'random_state': args['seed']}
        fit_kwargs = {'verbose': False}

    elif args['model_name'] == 'rf_reg':
        framework = 'sklearn'
        init_kwargs = {'n_estimators': args['rf_trees'], 'n_jobs': args['n_jobs'], 'random_state': args['seed']}
        fit_kwargs = {}

    # BUG FIX: the original condition was
    #   args['model_name'] == 'nn_reg0' or 'nn_reg1' or ...
    # Each bare string in the `or` chain is truthy, so ANY model name that
    # reached this elif was treated as a keras model. Use a membership test.
    elif args['model_name'] in ('nn_reg0', 'nn_reg1',
                                'nn_reg_layer_less', 'nn_reg_layer_more',
                                'nn_reg_neuron_less', 'nn_reg_neuron_more'):
        framework = 'keras'
        init_kwargs = {'input_dim': xdata.shape[1], 'dr_rate': args['dr_rate'], 'opt_name': args['opt'],
                       'lr': args['lr'], 'batchnorm': args['batchnorm'], 'logger': lg.logger}
        fit_kwargs = {'batch_size': args['batch_size'], 'epochs': args['epochs'], 'verbose': 1}  # 'validation_split': 0.1

    else:
        # Previously unknown names silently fell into the keras branch;
        # fail loudly instead of training the wrong model.
        raise ValueError("Unknown model_name: {}".format(args['model_name']))

    # -----------------------------------------------
    #      Learning curve 
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('-'*50))
    lg.logger.info(f'Learning curves {src} ...')
    lg.logger.info('-'*50)

    # Constructor kwargs: split definitions, shard sizing, logging/output.
    lrn_crv_init_kwargs = { 'cv': None, 'cv_lists': (tr_id, vl_id, te_id), 'cv_folds_arr': args['cv_folds_arr'],
            'shard_step_scale': args['shard_step_scale'], 'n_shards': args['n_shards'], 'min_shard': args['min_shard'], 'max_shard': args['max_shard'],
            'shards_arr': args['shards_arr'], 'args': args, 'logger': lg.logger, 'outdir': outdir}

    # Training kwargs: framework selection and per-model hyperparameters.
    lrn_crv_trn_kwargs = { 'framework': framework, 'mltype': mltype, 'model_name': args['model_name'],
            'init_kwargs': init_kwargs, 'fit_kwargs': fit_kwargs, 'clr_keras_kwargs': clr_keras_kwargs,
            'n_jobs': args['n_jobs'], 'random_state': args['seed'] }

    t0 = time()
    lc = LearningCurve( X=xdata, Y=ydata, meta=meta, **lrn_crv_init_kwargs )
    lrn_crv_scores = lc.trn_learning_curve( **lrn_crv_trn_kwargs )
    lg.logger.info('Runtime: {:.1f} hrs'.format( (time()-t0)/3600) )

    lg.kill_logger()
    # Release the large arrays before returning.
    del xdata, ydata

    print('Done.')