Example #1
def compute_target_distribution(labels, plot=False, verbose=False):
    distribution = pd.DataFrame(labels).sum(axis=0)
    if verbose:
        print('target feature representation')
        print('-----------------------------')
        print(distribution.describe())
        print('-----------------------------')
    if plot:
        plot_hist(distribution, 'target feature representation distribution')
    return distribution
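A brief usage sketch (the labels array and names below are made up for illustration; plot_hist is assumed to come from the surrounding module, so plotting is skipped):

import numpy as np
import pandas as pd

# Hypothetical multi-hot label matrix: 3 samples, 3 target classes.
labels = np.array([[1, 0, 1],
                   [0, 1, 1],
                   [1, 1, 1]])

dist = compute_target_distribution(labels, plot=False, verbose=True)
print(dist.tolist())  # per-class counts: [2, 2, 3]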
Example #2
            print(
                'Agent: {}, avg reward: {:0.2f}, avg action: {:0.2f}, learning rate: {:0.4f}, batch: {}'
                .format(
                    i, np.mean(test_data[3][i]), np.mean(test_data[2][i]),
                    *agent.sess.run([agent.learning_rate, agent.global_step])))

            # Batch train agent
            agent_data = [
                train_data[0], train_data[1], train_data[2][i],
                train_data[3][i]
            ]
            agent.batch_train(agent_data, save_every=99999999)
            agent.epsilon = agent.epsilon * 0.95
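            # Geometric exploration decay: after n of these updates, epsilon = epsilon_0 * 0.95**n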

    plotting.plot_hist(actions[0],
                       xticks=np.arange(len(env.action_space)),
                       show=False,
                       save_path='../plots/monopolist_actions.png')

    lines_to_plot = OrderedDict([('Monopolist outcome',
                                  theoretical_opt[0]['profit'])])

    plotting.plot_rewards(avg_rewards[0],
                          lines=lines_to_plot,
                          show=False,
                          save_path='../plots/monopolist_rewards.png')
    # plotting.plot_signals(signals=[task.episode_observations[0][4]], intercept=task.episode_realized_intercepts[0][3],
    #     slope=1, show=False, save_path='../plots/signals.png')
    # some_intercepts = np.concatenate([i for i in task.episode_realized_intercepts])
    # plotting.plot_demand_curves(some_intercepts[:1000], slope=1, xmax=15, ymax=15,
    #     show=False, save_path='../plots/demand_curves.png')
def run(args):
    dirpath = Path(args['dirpath'])

    # Data splits
    # te_method = args['te_method']
    # cv_method = args['cv_method']
    # te_size = split_size(args['te_size'])
    vl_size = split_size(args['vl_size'])

    # Features 
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    fea_list = cell_fea + drug_fea
    
    # Other params
    n_jobs = args['n_jobs']

    # Hard split
    # grp_by_col = None
    split_on = None if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method 

    # TODO: this needs to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)
    
    
    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    outdir = Path( str(dirpath) + '_splits' )
    os.makedirs(outdir, exist_ok=True)
    
    lg = Logger(outdir/'splitter.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=outdir/'args.txt')

    
    # -----------------------------------------------
    #       Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    files = list(dirpath.glob('**/*.parquet'))
    if len(files) > 0:
        # TODO: assumes that there is only one data file
        data = pd.read_parquet( files[0], engine='auto', columns=None )
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and target, and dump to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet( outdir/'xdata.parquet' )
    meta.to_parquet( outdir/'meta.parquet' )
    
    lg.logger.info('Total DD: {}'.format( len([c for c in xdata.columns if 'DD_' in c]) ))
    lg.logger.info('Total GE: {}'.format( len([c for c in xdata.columns if 'GE_' in c]) ))
    lg.logger.info('Unique cells: {}'.format( meta['CELL'].nunique() ))
    lg.logger.info('Unique drugs: {}'.format( meta['DRUG'].nunique() ))
    # cnt_fea(df, fea_sep='_', verbose=True, logger=lg.logger)

    plot_hist(meta['AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_all.png')
    

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    """
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Convert vl_size into int
        # vl_size_int = int(vl_size*len(idx_vec))

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=SEED)

        # Command meta[split_on].values[idx_vec] returns the vector meta[split_on].values
        # in an order specified by idx_vec
        # For example:
        # aa = meta[split_on][:3]
        # print(aa.values)
        # print(aa.values[[0,2,1]])
        # m = meta[split_on]
        cv_grp = meta[split_on].values[idx_vec] if split_on is not None else None
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
    
        tr_folds = {} 
        vl_folds = {} 
        te_folds = {}
        
        # Start CV iters
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold+1}')
            tr_id = idx_vec[tr_id] # adjust the indices!
            vl_id = idx_vec[vl_id] # adjust the indices!
            # t = meta.loc[tr_id, split_on]
            # v = meta.loc[vl_id, split_on]
            # print(len(vl_id)/len(idx_vec))
            
            # -----------------
            # Store tr ids
            tr_folds[fold] = tr_id.tolist()

            # Create splitter that splits vl into vl and te (splits by half)
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=0.5,
                                      mltype=mltype, shuffle=False, random_state=SEED)

            # Update the index array
            idx_vec_ = vl_id; del vl_id

            te_grp = meta[split_on].values[idx_vec_] if split_on is not None else None
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split vl set into vl and te
            vl_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            vl_id = idx_vec_[vl_id] # adjust the indices!
            te_id = idx_vec_[te_id] # adjust the indices!
            # v = meta.loc[vl_id, split_on]
            # e = meta.loc[te_id, split_on]

            # Store vl and te ids
            vl_folds[fold] = vl_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format( len(tr_id), 100*len(tr_id)/xdata.shape[0] ))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format( len(vl_id), 100*len(vl_id)/xdata.shape[0] ))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format( len(te_id), 100*len(te_id)/xdata.shape[0] ))

            # Confirm that group splits are correct
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convert to df
        # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T 
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
        vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
        te_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in te_folds.items() ]))

        # Dump
        tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
        vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
        te_folds.to_csv( outdir/f'{cv_folds}fold_te_id.csv', index=False )
        
        # Plot target dist only for the 1-fold case
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds==1 and fold==0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
            
            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
            
            # pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
            # pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
            # pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')
    """


    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=SEED)

        cv_grp = meta[split_on].values[idx_vec] if split_on is not None else None
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
    
        tr_folds = {} 
        vl_folds = {} 
        te_folds = {}
        
        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold}')
            tr_id = idx_vec[tr_id] # adjust the indices!
            vl_id = idx_vec[vl_id] # adjust the indices!

            # -----------------
            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Recompute te_size relative to the reduced pool of samples that remain after removing vl
            if cv_folds == 1:
                te_size_ = vl_size / (1 - vl_size)
            else:
                te_size_ = len(vl_id)/len(idx_vec) / (1 - len(vl_id)/len(idx_vec))
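            # Rationale: if vl is a fraction f of all samples, a test set of the same absolute
            # size must be a fraction f / (1 - f) of the remaining (train) pool.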

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size_,
                                      mltype=mltype, shuffle=False, random_state=SEED)

            # Update the index array
            idx_vec_ = tr_id; del tr_id

            te_grp = meta[split_on].values[idx_vec_] if split_on is not None else None
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id] # adjust the indices!
            te_id = idx_vec_[te_id] # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format( len(tr_id), 100*len(tr_id)/xdata.shape[0] ))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format( len(vl_id), 100*len(vl_id)/xdata.shape[0] ))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format( len(te_id), 100*len(te_id)/xdata.shape[0] ))

            # Confirm that group splits are correct
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convert to df
        # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T 
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
        vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
        te_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in te_folds.items() ]))

        # Dump
        tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
        vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
        te_folds.to_csv( outdir/f'{cv_folds}fold_te_id.csv', index=False )
        
        # Plot target dist only for the 1-fold case
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds==1 and fold==0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
            
            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
            
            # pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
            # pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
            # pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')



#     # -----------------------------------------------
#     #       Train-test split
#     # -----------------------------------------------
#     np.random.seed(SEED)
#     idx_vec = np.random.permutation(xdata.shape[0])
# 
#     if te_method is not None:
#         lg.logger.info('\nSplit train/test.')
#         te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size,
#                                   mltype=mltype, shuffle=False, random_state=SEED)
# 
#         te_grp = meta[grp_by_col].values[idx_vec] if te_method=='group' else None
#         if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)
#    
#         # Split train/test
#         tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
#         tr_id = idx_vec[tr_id] # adjust the indices!
#         te_id = idx_vec[te_id] # adjust the indices!
# 
#         pd.Series(tr_id).to_csv( outdir/f'tr_id.csv', index=False, header=[0] )
#         pd.Series(te_id).to_csv( outdir/f'te_id.csv', index=False, header=[0] )
#         
#         lg.logger.info('Train: {:.1f}'.format( len(tr_id)/xdata.shape[0] ))
#         lg.logger.info('Test:  {:.1f}'.format( len(te_id)/xdata.shape[0] ))
#         
#         # Update the master idx vector for the CV splits
#         idx_vec = tr_id
# 
#         # Plot dist of responses (TODO: this can be done to all response metrics)
#         # plot_ytr_yvl_dist(ytr=tr_ydata.values, yvl=te_ydata.values,
#         #         title='tr and te', outpath=run_outdir/'tr_te_resp_dist.png')
# 
#         # Confirm that group splits are correct
#         if te_method=='group' and grp_by_col is not None:
#             tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
#             te_grp_unq = set(meta.loc[te_id, grp_by_col])
#             lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
#             lg.logger.info(f'\tA few intersections : {list(tr_grp_unq.intersection(te_grp_unq))[:3]}.')
# 
#         # Update vl_size to effective vl_size
#         vl_size = vl_size * xdata.shape[0]/len(tr_id)
#         
#         # Plot hist te
#         pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')
#         plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
# 
#         del tr_id, te_id
# 
# 
#     # -----------------------------------------------
#     #       Generate CV splits
#     # -----------------------------------------------
#     cv_folds_list = [1, 5, 7, 10, 15, 20, 25]
#     lg.logger.info(f'\nStart CV splits ...')
#     
#     for cv_folds in cv_folds_list:
#         lg.logger.info(f'\nCV folds: {cv_folds}')
# 
#         cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
#                          mltype=mltype, shuffle=False, random_state=SEED)
# 
#         cv_grp = meta[grp_by_col].values[idx_vec] if cv_method=='group' else None
#         if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
#     
#         tr_folds = {}
#         vl_folds = {}
# 
#         # Start CV iters
#         for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
#             tr_id = idx_vec[tr_id] # adjust the indices!
#             vl_id = idx_vec[vl_id] # adjust the indices!
# 
#             tr_folds[fold] = tr_id.tolist()
#             vl_folds[fold] = vl_id.tolist()
# 
#             # Confirm that group splits are correct
#             if cv_method=='group' and grp_by_col is not None:
#                 tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
#                 vl_grp_unq = set(meta.loc[vl_id, grp_by_col])
#                 lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
#                 lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
#                 lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
#         
#         # Convert to df
#         # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
#         # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T 
#         # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
#         tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
#         vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
# 
#         # Dump
#         tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
#         vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
#         
#         # Plot target dist only for the 1-fold case
#         if cv_folds==1 and fold==0:
#             plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
#             plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
#             
#             plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
#                               title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
#             
#             pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
#             pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
 
    lg.kill_logger()
    print('Done.')
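cv_splitter() is a helper from the surrounding project and is not shown in these examples. A minimal sketch of what such a factory could look like, assuming it simply wraps scikit-learn splitters (an illustration only, not the project's actual implementation):

from sklearn.model_selection import (KFold, GroupKFold, ShuffleSplit,
                                     GroupShuffleSplit)

def cv_splitter(cv_method='simple', cv_folds=5, test_size=0.2,
                mltype='reg', shuffle=False, random_state=None):
    """Return a scikit-learn splitter; 'group' splitters expect groups= in split()."""
    if cv_folds == 1:
        # A single train/test split controlled by test_size.
        if cv_method == 'group':
            return GroupShuffleSplit(n_splits=1, test_size=test_size,
                                     random_state=random_state)
        return ShuffleSplit(n_splits=1, test_size=test_size,
                            random_state=random_state)
    if cv_method == 'group':
        return GroupKFold(n_splits=cv_folds)
    # mltype would select a stratified splitter for classification; 'reg' uses plain KFold.
    return KFold(n_splits=cv_folds, shuffle=shuffle,
                 random_state=random_state if shuffle else None)

This would be consistent with how the examples call it: test_size is only honored for the single-split case, while multi-fold calls rely on the fold count alone.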
Example #4
    def trn_learning_curve(
            self,
            framework: str = 'lightgbm',
            mltype: str = 'reg',
            model_name: str = 'lgb_reg',  # TODO! this is redundant
            init_kwargs: dict = {},
            fit_kwargs: dict = {},
            clr_keras_kwargs: dict = {},
            metrics: list = [
                'r2', 'neg_mean_absolute_error', 'neg_median_absolute_error',
                'neg_mean_squared_error'
            ],
            n_jobs: int = 4,
            random_state: int = None,
            plot=True):
        """ 
        Args:
            framework : ml framework (keras, lightgbm, or sklearn)
            mltype : type of ML problem (reg or cls)
            init_kwargs : dict of parameters that initialize the estimator
            fit_kwargs : dict of parameters to the estimator's fit() method
            clr_keras_kwargs : 
            metrics : list of metric names to compute (TODO: also allow passing a single string)
        """
        self.framework = framework
        self.mltype = mltype
        self.model_name = model_name
        self.init_kwargs = init_kwargs
        self.fit_kwargs = fit_kwargs
        self.clr_keras_kwargs = clr_keras_kwargs
        self.metrics = metrics
        self.n_jobs = n_jobs
        self.random_state = random_state

        # Start nested loop of train size and cv folds
        tr_scores_all = []  # list of dicts
        vl_scores_all = []  # list of dicts
        te_scores_all = []  # list of dicts

        # Record runtime per shard
        runtime_records = []

        # CV loop
        for fold, (tr_k, vl_k, te_k) in enumerate(
                zip(self.tr_dct.keys(), self.vl_dct.keys(),
                    self.te_dct.keys())):
            fold = fold + 1
            if self.logger is not None:
                self.logger.info(f'Fold {fold}/{self.cv_folds}')

            # Get the indices for this fold
            tr_id = self.tr_dct[tr_k]
            vl_id = self.vl_dct[vl_k]
            te_id = self.te_dct[te_k]

            # Training samples are drawn from this subset
            xtr = self.X[tr_id, :]
            # ytr = self.Y[tr_id, :]
            ytr = np.squeeze(self.Y[tr_id, :])

            # A fixed set of val samples for the current CV split
            xvl = self.X[vl_id, :]
            yvl = np.squeeze(self.Y[vl_id, :])

            # A fixed set of test samples for the current CV split
            xte = self.X[te_id, :]
            yte = np.squeeze(self.Y[te_id, :])

            # Shards loop (iterate across the dataset sizes and train)
            """
            np.random.seed(random_state)
            idx = np.random.permutation(len(xtr))
            Note: we intentionally do not shuffle the data a second time, which is why the commands above are left disabled.
            """
            idx = np.arange(len(xtr))
            for i, tr_sz in enumerate(self.tr_shards):
                # For each shard: train model, save best model, calc tr_scores, calc_vl_scores
                if self.logger:
                    self.logger.info(
                        f'\tTrain size: {tr_sz} ({i+1}/{len(self.tr_shards)})')

                # Sequentially take the first tr_sz samples (assumes the input dataset X is already shuffled)
                xtr_sub = xtr[idx[:tr_sz], :]
                # ytr_sub = np.squeeze(ytr[idx[:tr_sz], :])
                ytr_sub = ytr[idx[:tr_sz]]

                # Get the estimator
                estimator = ml_models.get_model(self.model_name,
                                                init_kwargs=self.init_kwargs)
                model = estimator.model

                # Train
                # self.val_split = 0 # 0.1 # used for early stopping
                #self.eval_frac = 0.1 # 0.1 # used for early stopping
                #eval_samples = int(self.eval_frac * xvl.shape[0])
                #eval_set = (xvl[:eval_samples, :], yvl[:eval_samples]) # we don't random sample; the same eval_set is used for early stopping
                eval_set = (xvl, yvl)
                if self.framework == 'lightgbm':
                    model, trn_outdir, runtime = self.trn_lgbm_model(
                        model=model,
                        xtr_sub=xtr_sub,
                        ytr_sub=ytr_sub,
                        fold=fold,
                        tr_sz=tr_sz,
                        eval_set=eval_set)
                elif self.framework == 'sklearn':
                    model, trn_outdir, runtime = self.trn_sklearn_model(
                        model=model,
                        xtr_sub=xtr_sub,
                        ytr_sub=ytr_sub,
                        fold=fold,
                        tr_sz=tr_sz,
                        eval_set=None)
                elif self.framework == 'keras':
                    model, trn_outdir, runtime = self.trn_keras_model(
                        model=model,
                        xtr_sub=xtr_sub,
                        ytr_sub=ytr_sub,
                        fold=fold,
                        tr_sz=tr_sz,
                        eval_set=eval_set)
                elif self.framework == 'pytorch':
                    pass
                else:
                    raise ValueError(
                        f'Framework {self.framework} is not supported.')

                # Save plot of target distribution
                plot_hist(ytr_sub,
                          var_name=f'Target (Train size={tr_sz})',
                          fit=None,
                          bins=100,
                          path=trn_outdir / 'target_hist_tr.png')
                plot_hist(yvl,
                          var_name=f'Target (Val size={len(yvl)})',
                          fit=None,
                          bins=100,
                          path=trn_outdir / 'target_hist_vl.png')
                plot_hist(yte,
                          var_name=f'Target (Test size={len(yte)})',
                          fit=None,
                          bins=100,
                          path=trn_outdir / 'target_hist_te.png')

                # Calc preds and scores TODO: dump preds
                # ... training set
                y_pred, y_true = calc_preds(model,
                                            x=xtr_sub,
                                            y=ytr_sub,
                                            mltype=self.mltype)
                tr_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                tr_scores['y_avg'] = np.mean(y_pred)
                # ... val set
                y_pred, y_true = calc_preds(model,
                                            x=xvl,
                                            y=yvl,
                                            mltype=self.mltype)
                vl_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                vl_scores['y_avg'] = np.mean(y_pred)
                # ... test set
                y_pred, y_true = calc_preds(model,
                                            x=xte,
                                            y=yte,
                                            mltype=self.mltype)
                te_scores = calc_scores(y_true=y_true,
                                        y_pred=y_pred,
                                        mltype=self.mltype,
                                        metrics=None)
                te_scores['y_avg'] = np.mean(y_pred)

                del estimator, model

                # Save predictions (need to include metadata)
                # TODO
                pass

                # Store runtime
                runtime_records.append((fold, tr_sz, runtime))

                # Add metadata
                # tr_scores['tr_set'] = True
                tr_scores['set'] = 'tr'
                tr_scores['fold'] = 'fold' + str(fold)
                tr_scores['tr_size'] = tr_sz

                # vl_scores['tr_set'] = False
                vl_scores['set'] = 'vl'
                vl_scores['fold'] = 'fold' + str(fold)
                vl_scores['tr_size'] = tr_sz

                # te_scores['tr_set'] = False
                te_scores['set'] = 'te'
                te_scores['fold'] = 'fold' + str(fold)
                te_scores['tr_size'] = tr_sz

                # Append scores (dicts)
                tr_scores_all.append(tr_scores)
                vl_scores_all.append(vl_scores)
                te_scores_all.append(te_scores)

                # Dump intermediate scores
                # TODO: test this!
                scores_tmp = pd.concat([
                    scores_to_df([tr_scores]),
                    scores_to_df([vl_scores]),
                    scores_to_df([te_scores])
                ],
                                       axis=0)
                scores_tmp.to_csv(trn_outdir / ('scores_tmp.csv'), index=False)
                del trn_outdir, scores_tmp

            # Dump intermediate results (useful if the run terminates early)
            scores_all_df_tmp = pd.concat([
                scores_to_df(tr_scores_all),
                scores_to_df(vl_scores_all),
                scores_to_df(te_scores_all)
            ],
                                          axis=0)
            scores_all_df_tmp.to_csv(
                self.outdir / ('_lrn_crv_scores_cv' + str(fold) + '.csv'),
                index=False)

        # Scores to df
        tr_scores_df = scores_to_df(tr_scores_all)
        vl_scores_df = scores_to_df(vl_scores_all)
        te_scores_df = scores_to_df(te_scores_all)
        scores_df = pd.concat([tr_scores_df, vl_scores_df, te_scores_df],
                              axis=0)

        # Dump final results
        tr_scores_df.to_csv(self.outdir / 'tr_lrn_crv_scores.csv', index=False)
        vl_scores_df.to_csv(self.outdir / 'vl_lrn_crv_scores.csv', index=False)
        te_scores_df.to_csv(self.outdir / 'te_lrn_crv_scores.csv', index=False)
        scores_df.to_csv(self.outdir / 'lrn_crv_scores.csv', index=False)

        # Runtime df
        runtime_df = pd.DataFrame.from_records(
            runtime_records, columns=['fold', 'tr_sz', 'time'])
        runtime_df.to_csv(self.outdir / 'runtime.csv', index=False)

        # Plot learning curves
        if plot:
            plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir)
            plot_lrn_crv_all_metrics(scores_df,
                                     outdir=self.outdir,
                                     xtick_scale='log2',
                                     ytick_scale='log2')
            plot_runtime(runtime_df,
                         outdir=self.outdir,
                         xtick_scale='log2',
                         ytick_scale='log2')

        return scores_df
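calc_preds() and calc_scores() are also project helpers that are not shown here. A rough, regression-only sketch of what calc_scores() might compute, using standard scikit-learn metrics (an assumption for illustration; the real helper presumably also handles mltype='cls' and the metrics argument):

from sklearn.metrics import (r2_score, mean_absolute_error,
                             median_absolute_error, mean_squared_error)

def calc_scores(y_true, y_pred, mltype='reg', metrics=None):
    """Return a dict of scores; only the regression branch is sketched."""
    scores = {}
    if mltype == 'reg':
        scores['r2'] = r2_score(y_true, y_pred)
        scores['neg_mean_absolute_error'] = -mean_absolute_error(y_true, y_pred)
        scores['neg_median_absolute_error'] = -median_absolute_error(y_true, y_pred)
        scores['neg_mean_squared_error'] = -mean_squared_error(y_true, y_pred)
    return scores

Returning a plain dict fits the example above, which augments the result with 'y_avg', 'set', 'fold', and 'tr_size' keys before collecting everything into a DataFrame.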
Example #5
def main(cfg):
    # setting up output directories, and writing to stdout
    make_dirs(cfg.stdout_dir, replace=False)
    if cfg.train:
        run_type = 'train'
    else:
        if 'weight' in cfg.prune_type.lower():
            run_type = 'weight-prune'
        else:
            run_type = 'unit-prune'
    sys.stdout = open(
        '{}/stdout_{}_{}.txt'.format(cfg.stdout_dir, cfg.model_name, run_type),
        'w')
    print(cfg)
    print('\n')
    sys.stdout.flush()

    # if train mode, replace the previous plot and ckpt directories; if in prune mode, use existing directories
    if cfg.plot:
        make_dirs(os.path.join(cfg.plot_dir, cfg.model_name),
                  replace=cfg.train)
    if cfg.save_model:
        make_dirs(os.path.join(cfg.model_dir, cfg.model_name),
                  replace=cfg.train)

    # set random seed
    if cfg.random_seed != 0:
        random_seed = cfg.random_seed
    else:
        random_seed = random.randint(1, 100000)
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    # set device as cuda or cpu
    if cfg.use_gpu and torch.cuda.is_available():
        # reproducibility using cuda
        torch.cuda.manual_seed(random_seed)
        cudnn.deterministic = True
        cudnn.benchmark = False
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        if cfg.use_gpu:
            print('gpu option was set to <True>, but no cuda device was found')
            print('\n')

    # datasets and dataloaders
    # normalizing training and validation images to [0, 1] suffices for the purposes of our research objective
    # in training, <drop_last> is set to <True> so the last incomplete minibatch of each epoch is dropped, which simplifies tracking training performance
    dataset_train = MNIST(root='./data/mnist',
                          train=True,
                          download=True,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]),
                          target_transform=None)
    dataloader_train = DataLoader(dataset=dataset_train,
                                  batch_size=cfg.batch_size,
                                  shuffle=cfg.shuffle,
                                  num_workers=cfg.num_workers,
                                  pin_memory=True,
                                  drop_last=True)

    dataset_val = MNIST(root='./data/mnist',
                        train=False,
                        download=True,
                        transform=transforms.Compose([transforms.ToTensor()]),
                        target_transform=None)
    dataloader_val = DataLoader(dataset=dataset_val,
                                batch_size=100,
                                shuffle=False,
                                num_workers=cfg.num_workers,
                                pin_memory=True,
                                drop_last=False)

    # automatically compute number of classes
    targets = np.asarray(dataset_train.targets)
    c = np.unique(targets).shape[0]

    # define model
    # weights initialized using Kaiming uniform (He initialization)
    # number of units per hidden layer is passed in as an argument
    net = Net(np.prod(cfg.img_size), c, cfg.units).to(device)

    criterion = nn.CrossEntropyLoss()

    if cfg.train:
        # training mode

        if cfg.use_sgd:
            optimizer = optim.SGD(params=net.parameters(),
                                  lr=cfg.lr,
                                  momentum=cfg.momentum,
                                  nesterov=cfg.use_nesterov)
        else:
            optimizer = optim.Adam(params=net.parameters(),
                                   lr=cfg.lr,
                                   betas=(cfg.beta1, cfg.beta2))

        # tracking training and validation stats over epochs
        epochs = []
        train_loss_epochs, val_loss_epochs = [], []
        train_acc_epochs, val_acc_epochs = [], []

        # best model is defined as model with best performing validation loss
        best_loss = float('inf')
        for epoch in range(cfg.epochs):
            # tracking training and validation stats over a given epoch
            train_loss_epoch, val_loss_epoch = [], []
            train_acc_epoch, val_acc_epoch = [], []

            # training set
            for i, (x, y) in enumerate(dataloader_train):
                x, y = x.to(device), y.to(device)

                optimizer.zero_grad()
                logits = net(x)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()

                acc = calculate_acc(logits, y)

                append((train_loss_epoch, loss.item()),
                       (train_acc_epoch, acc.item()))

            # validation set
            with torch.no_grad():
                for i, (x, y) in enumerate(dataloader_val):
                    x, y = x.to(device), y.to(device)

                    logits = net(x)
                    loss = criterion(logits, y)

                    acc = calculate_acc(logits, y)

                    append((val_loss_epoch, loss.item()),
                           (val_acc_epoch, acc.item()))

            train_loss_epoch, val_loss_epoch = get_average(
                train_loss_epoch), get_average(val_loss_epoch)
            train_acc_epoch, val_acc_epoch = get_average(
                train_acc_epoch), get_average(val_acc_epoch)

            print('train_epoch{:0=3d}_loss{:.4f}_acc{:.4f}'.format(
                epoch + 1, train_loss_epoch, train_acc_epoch))
            print('valid_epoch{:0=3d}_loss{:.4f}_acc{:.4f}'.format(
                epoch + 1, val_loss_epoch, val_acc_epoch))
            print('\n')
            sys.stdout.flush()

            if cfg.plot:
                append((epochs, epoch + 1),
                       (train_loss_epochs, train_loss_epoch),
                       (val_loss_epochs, val_loss_epoch),
                       (train_acc_epochs, train_acc_epoch),
                       (val_acc_epochs, val_acc_epoch))

                plot_line(epochs, train_loss_epochs, val_loss_epochs,
                          'Epoch Number', 'Loss', cfg)
                plot_line(epochs, train_acc_epochs, val_acc_epochs,
                          'Epoch Number', 'Accuracy', cfg)

            if val_loss_epoch < best_loss:
                best_loss = val_loss_epoch
                print('New best model at epoch {:0=3d} with val_loss {:.4f}'.
                      format(epoch + 1, best_loss))
                print('\n')
                if cfg.save_model:
                    # save model when validation loss improves
                    save_name = '{}_net_epoch{:0=3d}_val_loss{:.4f}'.format(
                        cfg.model_name, epoch + 1, best_loss)
                    torch.save(
                        net.state_dict(),
                        os.path.join(cfg.model_dir, cfg.model_name,
                                     '{}.pth'.format(save_name)))
                    with open(
                            os.path.join(cfg.model_dir, cfg.model_name,
                                         '{}.txt'.format(cfg.model_name)),
                            'w') as file:
                        file.write('{}.pth'.format(save_name))

    else:
        # pruning mode

        # checks on arguments passed in
        for k in cfg.sparsity:
            assert 0 <= k <= 1
        if cfg.use_sparse_mul:
            assert cfg.to_sparse

        # load model
        with open(
                os.path.join(cfg.model_dir, cfg.model_name,
                             '{}.txt'.format(cfg.model_name)), 'r') as file:
            load_name = file.readline()
        net.load_state_dict(
            torch.load(
                os.path.join(cfg.model_dir, cfg.model_name,
                             '{}'.format(load_name))))
        net.eval()

        # select pruning approach to use
        if 'weight' in cfg.prune_type.lower():
            prune = weight_prune
        else:
            prune = unit_prune

        sparsities = []
        val_loss_sparse, val_acc_sparse = [], []
        time_sparsities = []
        for k in cfg.sparsity:
            val_loss_k, val_acc_k = [], []
            time_k = []

            # copy network so that the sparsity changes are not additive for each k
            net_sparse = copy.deepcopy(net)

            pruned_weights = []
            # prune model, except for the last layer
            for (i, p) in enumerate(net_sparse.parameters()):
                if i < len(cfg.units):
                    original_weights = copy.deepcopy(p.data)
                    if cfg.plot:
                        # plot magnitude of original weights (for comparison to post-pruned weights)
                        plot_hist([
                            torch.abs(
                                original_weights.flatten()).cpu().numpy()
                        ], ['b'], cfg.prune_type, i + 1, k,
                                  'Non-Pruned Weight Magnitudes', 'Counts',
                                  cfg)
                    prune(p.data, k)
                    if cfg.plot:
                        # plot original magnitudes of pruned weights, and magnitudes of remaining weights, separately
                        pruned_weights_non_zero = torch.abs(
                            original_weights.flatten()[p.data.flatten() != 0])
                        pruned_weights_zeroed = torch.abs(
                            original_weights.flatten()[p.data.flatten() == 0])
                        plot_hist([
                            pruned_weights_non_zero.cpu().numpy(),
                            pruned_weights_zeroed.cpu().numpy()
                        ], ['g', 'r'], cfg.prune_type, i + 1, k,
                                  'Weight Magnitudes', 'Counts', cfg)
                        plot_hist([pruned_weights_non_zero.cpu().numpy()],
                                  ['k'], cfg.prune_type, i + 1, k,
                                  'Surviving Weight Magnitudes', 'Counts', cfg)
                if cfg.to_sparse and i < len(cfg.units):
                    pruned_weights.append(p.data.to_sparse())
                else:
                    pruned_weights.append(p.data)

            with torch.no_grad():
                for i, (x, y) in enumerate(dataloader_val):
                    x, y = x.to(device), y.to(device)

                    start = time.time()
                    logits = forward(x, pruned_weights, cfg.use_sparse_mul)
                    end = time.time()
                    loss = criterion(logits, y)

                    acc = calculate_acc(logits, y)

                    append((val_loss_k, loss.item()), (val_acc_k, acc.item()),
                           (time_k, end - start))

            val_loss_k, val_acc_k, time_k = get_average(
                val_loss_k), get_average(val_acc_k), get_average(time_k)

            print('valid_{}_k{:.2f}_loss{:.4f}_acc{:.4f}'.format(
                run_type, k, val_loss_k, val_acc_k))
            print('valid_{}_k{:.2f}_time/minibatch{:.6f}'.format(
                run_type, k, time_k))
            print('\n')
            sys.stdout.flush()

            if cfg.plot:
                append((sparsities, k), (val_loss_sparse, val_loss_k),
                       (val_acc_sparse, val_acc_k), (time_sparsities, time_k))

                plot_line(sparsities, [], val_loss_sparse,
                          'Sparsity {} Prune'.format(cfg.prune_type), 'Loss',
                          cfg)
                plot_line(sparsities, [], val_acc_sparse,
                          'Sparsity {} Prune'.format(cfg.prune_type),
                          'Accuracy', cfg)
                plot_line(sparsities, [], time_sparsities,
                          'Sparsity {} Prune'.format(cfg.prune_type), 'Time',
                          cfg)

            if cfg.save_model:
                torch.save(
                    net_sparse.state_dict(),
                    os.path.join(
                        cfg.model_dir, cfg.model_name,
                        '{}_sparse_net_{}_val_loss{:.4f}.pth'.format(
                            cfg.model_name, run_type, val_loss_k)))
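weight_prune() and unit_prune() are imported from elsewhere in this project. A compact sketch of the two standard schemes they presumably implement: zeroing the fraction k of individual weights with the smallest magnitude, versus zeroing the fraction k of columns (units) with the smallest L2 norm. Treat this as an illustration under those assumptions (2-D weight matrices, in-place modification), not the project's exact code:

import torch

def weight_prune(w: torch.Tensor, k: float) -> None:
    """Zero out, in place, the fraction k of entries in w with the smallest magnitude."""
    n_prune = int(k * w.numel())
    if n_prune == 0:
        return
    threshold = torch.kthvalue(w.abs().flatten(), n_prune).values
    w[w.abs() <= threshold] = 0.0

def unit_prune(w: torch.Tensor, k: float) -> None:
    """Zero out, in place, the fraction k of columns of w with the smallest L2 norm."""
    n_prune = int(k * w.shape[1])
    if n_prune == 0:
        return
    col_norms = w.norm(p=2, dim=0)
    _, idx = torch.topk(col_norms, n_prune, largest=False)
    w[:, idx] = 0.0

Both modify the tensor in place, which matches how prune(p.data, k) is invoked in the loop above.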
Example #6
                    i, np.mean(test_data[3][i]), np.mean(test_data[2][i]),
                    *agent.sess.run([agent.learning_rate, agent.global_step])))

            # Batch train agent
            agent_data = [
                train_data[0], train_data[1], train_data[2][i],
                train_data[3][i]
            ]
            agent.batch_train(agent_data)
            agent.epsilon = agent.epsilon * 0.975
            agent.save(name='duopolist_{}'.format(i), step=step)

    for i in range(len(agents_list)):
        plotting.plot_hist(
            actions[i],
            xticks=np.arange(len(env.action_space)),
            show=False,
            save_path='../plots/duopolist_{}_actions.png'.format(i))

        lines_to_plot = OrderedDict([
            ('Competitive outcome', duopoly_outcome_comp[i]['profit']),
            ('Collusive outcome', duopoly_outcome_coll[i]['profit']),
            ('Deviation outcome', duopoly_outcome_deviate[i]['profit'])
        ])

        plotting.plot_rewards(
            avg_rewards[i],
            lines=lines_to_plot,
            show=False,
            save_path='../plots/duopolist_{}_rewards.png'.format(i))
Example #7
def run(args):
    dirpath = verify_dirpath(args['dirpath'])
    te_size = split_size(args['te_size'])
    fea_list = args['cell_fea'] + args['drug_fea']

    # Hard split
    split_on = None if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method

    # TODO: this needs to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #       Create (outdir and) logger
    # -----------------------------------------------
    outdir = create_outdir(dirpath, args)
    args['outdir'] = str(outdir)
    lg = Logger(outdir / 'data_splitter_logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(args, outpath=outdir / 'data_splitter_args.txt')  # dump args.

    # -----------------------------------------------
    #       Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    # files = list(dirpath.glob('**/*.parquet'))
    files = list(dirpath.glob('./*.parquet'))
    if len(files) > 0:
        data = pd.read_parquet(
            files[0])  # TODO: assumes that there is only one data file
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and target, and dump to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet(outdir / 'xdata.parquet')
    meta.to_parquet(outdir / 'meta.parquet')

    lg.logger.info('Total DD: {}'.format(
        len([c for c in xdata.columns if 'DD_' in c])))
    lg.logger.info('Total GE: {}'.format(
        len([c for c in xdata.columns if 'GE_' in c])))
    lg.logger.info('Unique cells: {}'.format(meta['CELL'].nunique()))
    lg.logger.info('Unique drugs: {}'.format(meta['DRUG'].nunique()))
    # cnt_fea(df, fea_sep='_', verbose=True, logger=lg.logger)

    plot_hist(meta['AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_all.png')

    # -----------------------------------------------
    #       Generate Hold-Out split (train/val/test)
    # -----------------------------------------------
    """ First, we split the data into train and test. The remaining of train set is further
    splitted into train and validation.
    """
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info('Split into hold-out train/val/test')
    lg.logger.info('{}'.format('-' * 50))

    # Note that we don't shuffle the original dataset; instead, we create a shuffled vector
    # of row indices that represents it.
    np.random.seed(args['seed'])
    idx_vec = np.random.permutation(data.shape[0])

    # Create splitter that splits the full dataset into tr and te
    te_folds = int(1 / te_size)
    te_splitter = cv_splitter(cv_method=te_method,
                              cv_folds=te_folds,
                              test_size=None,
                              mltype=mltype,
                              shuffle=False,
                              random_state=args['seed'])

    te_grp = None if split_on is None else meta[split_on].values[idx_vec]
    if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

    # Split tr into tr and te
    tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    tr_id = idx_vec[tr_id]  # adjust the indices! the remaining tr is further split into tr and vl below
    te_id = idx_vec[te_id]  # adjust the indices!

    # Update a vector array that excludes the test indices
    idx_vec_ = tr_id
    del tr_id

    # Define vl_size while considering the new full size of the available samples
    vl_size = te_size / (1 - te_size)
    cv_folds = int(1 / vl_size)
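    # e.g. with te_size=0.1, 90% of the data remains, so vl_size = 0.1/0.9 ≈ 0.111 and the
    # validation split is again ~10% of the original dataset.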

    # Create splitter that splits tr into tr and vl
    cv = cv_splitter(cv_method=cv_method,
                     cv_folds=cv_folds,
                     test_size=None,
                     mltype=mltype,
                     shuffle=False,
                     random_state=args['seed'])

    cv_grp = None if split_on is None else meta[split_on].values[idx_vec_]
    if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)

    # Split tr into tr and vl
    tr_id, vl_id = next(cv.split(idx_vec_, groups=cv_grp))
    tr_id = idx_vec_[tr_id]  # adjust the indices!
    vl_id = idx_vec_[vl_id]  # adjust the indices!

    # Dump tr, vl, te indices
    np.savetxt(outdir / '1fold_tr_id.csv',
               tr_id.reshape(-1, 1),
               fmt='%d',
               delimiter='',
               newline='\n')
    np.savetxt(outdir / '1fold_vl_id.csv',
               vl_id.reshape(-1, 1),
               fmt='%d',
               delimiter='',
               newline='\n')
    np.savetxt(outdir / '1fold_te_id.csv',
               te_id.reshape(-1, 1),
               fmt='%d',
               delimiter='',
               newline='\n')
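    # These index files can later be read back by downstream code, e.g. with
    # np.loadtxt(outdir / '1fold_tr_id.csv', dtype=int) (a hypothetical consumer, not part of this script).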

    lg.logger.info('Train samples {} ({:.2f}%)'.format(
        len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
    lg.logger.info('Val   samples {} ({:.2f}%)'.format(
        len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
    lg.logger.info('Test  samples {} ({:.2f}%)'.format(
        len(te_id), 100 * len(te_id) / xdata.shape[0]))

    # Confirm that group splits are correct (no intersection)
    grp_col = 'CELL' if split_on is None else split_on
    print_intersection_on_var(meta,
                              tr_id=tr_id,
                              vl_id=vl_id,
                              te_id=te_id,
                              grp_col=grp_col,
                              logger=lg.logger)

    plot_hist(meta.loc[tr_id, 'AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_train.png')
    plot_hist(meta.loc[vl_id, 'AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_val.png')
    plot_hist(meta.loc[te_id, 'AUC'],
              var_name='AUC',
              fit=None,
              bins=100,
              path=outdir / 'AUC_hist_test.png')

    plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'],
                      yvl=meta.loc[vl_id, 'AUC'],
                      title='ytr_yvl_dist',
                      outpath=outdir / 'ytr_yvl_dist.png')

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    """ K-fold CV split is applied with multiple values of k. For each set of splits k, the dataset is divided
    into k splits, where each split results in train and val samples. In this process, we take the train samples,
    and divide them into a smaller subset of train samples and test samples.
    """
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info(f"Split into multiple sets k-fold splits (multiple k's)")
    lg.logger.info('{}'.format('-' * 50))
    cv_folds_list = [5, 7, 10, 15, 20]

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\n----- {cv_folds}-fold splits -----')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method,
                         cv_folds=cv_folds,
                         test_size=None,
                         mltype=mltype,
                         shuffle=False,
                         random_state=args['seed'])

        cv_grp = None if split_on is None else meta[split_on].values[idx_vec]
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds, vl_folds, te_folds = {}, {}, {}

        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec,
                                                       groups=cv_grp)):
            lg.logger.info(f'\nFold {fold+1}')
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            # -----------------
            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Recompute te_size relative to the reduced pool of samples that remain after removing vl
            vl_frac = len(vl_id) / len(idx_vec)
            te_size_ = vl_frac / (1 - vl_frac)
            te_folds_split = int(1 / te_size_)

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method,
                                      cv_folds=te_folds_split,
                                      test_size=None,
                                      mltype=mltype,
                                      shuffle=False,
                                      random_state=args['seed'])

            # Update the index array
            idx_vec_ = tr_id
            del tr_id

            te_grp = None if split_on is None else meta[split_on].values[
                idx_vec_]
            if is_string_dtype(te_grp):
                te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id]  # adjust the indices!
            te_id = idx_vec_[te_id]  # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format(
                len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format(
                len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format(
                len(te_id), 100 * len(te_id) / xdata.shape[0]))

            # Confirm that group splits are correct (no intersection)
            grp_col = 'CELL' if split_on is None else split_on
            print_intersection_on_var(meta,
                                      tr_id=tr_id,
                                      vl_id=vl_id,
                                      te_id=te_id,
                                      grp_col=grp_col,
                                      logger=lg.logger)

        # Convert to df
        # from_dict takes too long  -->  faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in tr_folds.items()]))
        vl_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in vl_folds.items()]))
        te_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in te_folds.items()]))

        # Dump
        tr_folds.to_csv(outdir / f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir / f'{cv_folds}fold_vl_id.csv', index=False)
        te_folds.to_csv(outdir / f'{cv_folds}fold_te_id.csv', index=False)

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    """
    # TODO: consider to separate the pipeline hold-out and k-fold splits!
    # Since we shuffled the dataset, we don't need to shuffle again.
    # np.random.seed(args['seed'])
    # idx_vec = np.random.permutation(xdata.shape[0])
    idx_vec = np.array(range(xdata.shape[0]))

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=args['seed'])

        cv_grp = None if split_on is None else meta[split_on].values[idx_vec]
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
    
        tr_folds, vl_folds, te_folds = {}, {}, {}
        
        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold}')
            tr_id = idx_vec[tr_id] # adjust the indices!
            vl_id = idx_vec[vl_id] # adjust the indices!

            # -----------------
            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Update te_size to the new full size of available samples
            if cv_folds == 1:
                te_size_ = vl_size / (1 - vl_size)
            else:
                te_size_ = len(vl_id)/len(idx_vec) / (1 - len(vl_id)/len(idx_vec))

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size_,
                                      mltype=mltype, shuffle=False, random_state=args['seed'])

            # Update the index array
            idx_vec_ = tr_id; del tr_id

            te_grp = None if split_on is None else meta[split_on].values[idx_vec_]
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id] # adjust the indices!
            te_id = idx_vec_[te_id] # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format( len(tr_id), 100*len(tr_id)/xdata.shape[0] ))
            lg.logger.info('Val   samples {} ({:.2f}%)'.format( len(vl_id), 100*len(vl_id)/xdata.shape[0] ))
            lg.logger.info('Test  samples {} ({:.2f}%)'.format( len(te_id), 100*len(te_id)/xdata.shape[0] ))

            # Confirm that group splits are correct
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convert to df
        # from_dict takes too long  -->  a faster approach is described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T 
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
        vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
        te_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in te_folds.items() ]))

        # Dump
        tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
        vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
        te_folds.to_csv( outdir/f'{cv_folds}fold_te_id.csv', index=False )
        
        # Plot target dist only for the 1-fold case
        # TODO: consider plotting the dist for all k-fold cases where k > 1
        if cv_folds==1 and fold==0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
            
            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
            
            # pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
            # pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
            # pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')            
    """

    #     # -----------------------------------------------
    #     #       Train-test split
    #     # -----------------------------------------------
    #     np.random.seed(SEED)
    #     idx_vec = np.random.permutation(xdata.shape[0])
    #
    #     if te_method is not None:
    #         lg.logger.info('\nSplit train/test.')
    #         te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size,
    #                                   mltype=mltype, shuffle=False, random_state=SEED)
    #
    #         te_grp = meta[grp_by_col].values[idx_vec] if te_method=='group' else None
    #         if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)
    #
    #         # Split train/test
    #         tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    #         tr_id = idx_vec[tr_id] # adjust the indices!
    #         te_id = idx_vec[te_id] # adjust the indices!
    #
    #         pd.Series(tr_id).to_csv( outdir/f'tr_id.csv', index=False, header=[0] )
    #         pd.Series(te_id).to_csv( outdir/f'te_id.csv', index=False, header=[0] )
    #
    #         lg.logger.info('Train: {:.1f}'.format( len(tr_id)/xdata.shape[0] ))
    #         lg.logger.info('Test:  {:.1f}'.format( len(te_id)/xdata.shape[0] ))
    #
    #         # Update the master idx vector for the CV splits
    #         idx_vec = tr_id
    #
    #         # Plot dist of responses (TODO: this can be done to all response metrics)
    #         # plot_ytr_yvl_dist(ytr=tr_ydata.values, yvl=te_ydata.values,
    #         #         title='tr and te', outpath=run_outdir/'tr_te_resp_dist.png')
    #
    #         # Confirm that group splits are correct
    #         if te_method=='group' and grp_by_col is not None:
    #             tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
    #             te_grp_unq = set(meta.loc[te_id, grp_by_col])
    #             lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
    #             lg.logger.info(f'\tA few intersections: {list(tr_grp_unq.intersection(te_grp_unq))[:3]}.')
    #
    #         # Update vl_size to effective vl_size
    #         vl_size = vl_size * xdata.shape[0]/len(tr_id)
    #
    #         # Plot hist te
    #         pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')
    #         plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
    #
    #         del tr_id, te_id
    #
    #
    #     # -----------------------------------------------
    #     #       Generate CV splits
    #     # -----------------------------------------------
    #     cv_folds_list = [1, 5, 7, 10, 15, 20, 25]
    #     lg.logger.info(f'\nStart CV splits ...')
    #
    #     for cv_folds in cv_folds_list:
    #         lg.logger.info(f'\nCV folds: {cv_folds}')
    #
    #         cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
    #                          mltype=mltype, shuffle=False, random_state=SEED)
    #
    #         cv_grp = meta[grp_by_col].values[idx_vec] if cv_method=='group' else None
    #         if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
    #
    #         tr_folds = {}
    #         vl_folds = {}
    #
    #         # Start CV iters
    #         for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
    #             tr_id = idx_vec[tr_id] # adjust the indices!
    #             vl_id = idx_vec[vl_id] # adjust the indices!
    #
    #             tr_folds[fold] = tr_id.tolist()
    #             vl_folds[fold] = vl_id.tolist()
    #
    #             # Confirm that group splits are correct
    #             if cv_method=='group' and grp_by_col is not None:
    #                 tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
    #                 vl_grp_unq = set(meta.loc[vl_id, grp_by_col])
    #                 lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
    #                 lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
    #                 lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
    #
    #         # Convert to df
    #         # from_dict takes too long  -->  a faster approach is described here: stackoverflow.com/questions/19736080/
    #         # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
    #         # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
    #         tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
    #         vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
    #
    #         # Dump
    #         tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
    #         vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
    #
    #         # Plot target dist only for the 1-fold case
    #         if cv_folds==1 and fold==0:
    #             plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
    #             plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
    #
    #             plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
    #                               title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
    #
    #             pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
    #             pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')

    lg.kill_logger()
    print('Done.')
Beispiel #8
0
import unpacking_reading
import plotting

def main(path, decil, color):
    # Read the data dict and plot one histogram per entry under the selected color key
    dic = unpacking_reading.read_ufs(path)
    for i in range(len(dic[color])):
        plotting.plot_hist(dic, decil, color, i)
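# Hypothetical call (the path, decile value, and color key are illustrative only and
# depend on the project's data files and on unpacking_reading.read_ufs):
#   main('ufs.pkl', decil=9, color='blue')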