def compute_target_distribution(labels, plot=False, verbose=False):
    """Count how often each target (column) is active across all samples.

    Args:
        labels: 2D array-like of per-sample target indicators (rows = samples,
            columns = target features).
        plot: if True, render a histogram of the per-target counts.
        verbose: if True, print summary statistics of the counts.

    Returns:
        pd.Series with one entry per target column: the column-wise sum.
    """
    per_target_counts = pd.DataFrame(labels).sum(axis=0)

    if verbose:
        # Print a framed summary of the count distribution.
        print('target feature representation')
        print('-----------------------------')
        print(per_target_counts.describe())
        print('-----------------------------')

    if plot:
        plot_hist(per_target_counts, 'target feature representation distribution')

    return per_target_counts
print( 'Agent: {}, avg reward: {:0.2f}, avg action: {:0.2f}, learning rate: {:0.4f}, batch: {}' .format( i, np.mean(test_data[3][i]), np.mean(test_data[2][i]), *agent.sess.run([agent.learning_rate, agent.global_step]))) # Batch train agent agent_data = [ train_data[0], train_data[1], train_data[2][i], train_data[3][i] ] agent.batch_train(agent_data, save_every=99999999) agent.epsilon = agent.epsilon * 0.95 plotting.plot_hist(actions[0], xticks=np.arange(len(env.action_space)), show=False, save_path='../plots/monopolist_actions.png') lines_to_plot = OrderedDict([('Monopolist outcome', theoretical_opt[0]['profit'])]) plotting.plot_rewards(avg_rewards[0], lines=lines_to_plot, show=False, save_path='../plots/monopolist_rewards.png') # plotting.plot_signals(signals=[task.episode_observations[0][4]], intercept=task.episode_realized_intercepts[0][3], # slope=1, show=False, save_path='../plots/signals.png') # some_intercepts = np.concatenate([i for i in task.episode_realized_intercepts]) # plotting.plot_demand_curves(some_intercepts[:1000], slope=1, xmax=15, ymax=15, # show=False, save_path='../plots/demand_curves.png')
def run(args):
    """Split a master drug-response dataset into train/val/test CV folds.

    Loads the (single) parquet file found under args['dirpath'], separates
    feature columns from metadata, then for several fold counts generates
    simple or group-based CV splits and dumps the per-fold index lists as CSV
    files into a sibling '<dirpath>_splits' directory.

    Args:
        args: dict of parsed CLI arguments ('dirpath', 'vl_size', 'cell_fea',
            'drug_fea', 'n_jobs', 'split_on', ...).
    """
    dirpath = Path(args['dirpath'])

    # Data splits
    # te_method = args['te_method']
    # cv_method = args['cv_method']
    # te_size = split_size(args['te_size'])
    vl_size = split_size(args['vl_size'])

    # Features
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    fea_list = cell_fea + drug_fea

    # Other params
    n_jobs = args['n_jobs']

    # Hard split: when 'split_on' is given, use group-wise splitting so that
    # all samples sharing a group value land on the same side of a split.
    # grp_by_col = None
    split_on = args['split_on'] if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method  # TODO: this needs to be improved

    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #       Create outdir and logger
    # -----------------------------------------------
    outdir = Path( str(dirpath) + '_splits' )
    os.makedirs(outdir, exist_ok=True)
    lg = Logger(outdir/'splitter.log')
    # NOTE(review): 'filepath' is not defined in this function — presumably a
    # module-level global set at import time; confirm it exists, otherwise this
    # raises NameError.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=outdir/'args.txt')

    # -----------------------------------------------
    #       Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    files = list(dirpath.glob('**/*.parquet'))
    # NOTE(review): if no parquet files are found, 'data' below is unbound and
    # the next line raises NameError — verify callers always provide data.
    if len(files) > 0:
        data = pd.read_parquet( files[0], engine='auto', columns=None )  # TODO: assumes that there is only one data file
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and traget, and dump to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet( outdir/'xdata.parquet' )
    meta.to_parquet( outdir/'meta.parquet' )

    lg.logger.info('Total DD: {}'.format( len([c for c in xdata.columns if 'DD_' in c]) ))
    lg.logger.info('Total GE: {}'.format( len([c for c in xdata.columns if 'GE_' in c]) ))
    lg.logger.info('Unique cells: {}'.format( meta['CELL'].nunique() ))
    lg.logger.info('Unique drugs: {}'.format( meta['DRUG'].nunique() ))
    # cnt_fea(df, fea_sep='_', verbose=True, logger=lg.logger)

    plot_hist(meta['AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_all.png')

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    # NOTE(review): the triple-quoted block below is dead code kept as
    # reference — an older variant that split vl into vl/te (50/50) instead of
    # splitting tr into tr/te as the active code further down does.
    """
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Convert vl_size into int
        # vl_size_int = int(vl_size*len(idx_vec))

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=SEED)

        # Command meta[split_on].values[idx_vec] returns the vector meta[split_on].values
        # in an order specified by idx_vec
        # For example:
        # aa = meta[split_on][:3]
        # print(aa.values)
        # print(aa.values[[0,2,1]])
        # m = meta[split_on]
        cv_grp = meta[split_on].values[idx_vec] if split_on is not None else None
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds = {}
        vl_folds = {}
        te_folds = {}

        # Start CV iters
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold+1}')
            tr_id = idx_vec[tr_id] # adjust the indices!
            vl_id = idx_vec[vl_id] # adjust the indices!

            # t = meta.loc[tr_id, split_on]
            # v = meta.loc[vl_id, split_on]
            # print(len(vl_id)/len(idx_vec))

            # -----------------
            # Store tr ids
            tr_folds[fold] = tr_id.tolist()

            # Create splitter that splits vl into vl and te (splits by half)
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=0.5,
                                      mltype=mltype, shuffle=False, random_state=SEED)

            # Update the index array
            idx_vec_ = vl_id; del vl_id

            te_grp = meta[split_on].values[idx_vec_] if split_on is not None else None
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split vl set into vl and te
            vl_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            vl_id = idx_vec_[vl_id] # adjust the indices!
            te_id = idx_vec_[te_id] # adjust the indices!

            # v = meta.loc[vl_id, split_on]
            # e = meta.loc[te_id, split_on]

            # Store vl and te ids
            vl_folds[fold] = vl_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format( len(tr_id), 100*len(tr_id)/xdata.shape[0] ))
            lg.logger.info('Val samples {} ({:.2f}%)'.format( len(vl_id), 100*len(vl_id)/xdata.shape[0] ))
            lg.logger.info('Test samples {} ({:.2f}%)'.format( len(te_id), 100*len(te_id)/xdata.shape[0] ))

            # Confirm that group splits are correct
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convet to df
        # from_dict takes too long --> faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
        vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
        te_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in te_folds.items() ]))

        # Dump
        tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
        vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
        te_folds.to_csv( outdir/f'{cv_folds}fold_te_id.csv', index=False )

        # Plot target dist only for the 1-fold case
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds==1 and fold==0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')

            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')

            # pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
            # pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
            # pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')
    """

    # -----------------------------------------------
    #       Generate CV splits (new)
    # -----------------------------------------------
    # Note: the dataset itself is not shuffled; instead a permuted index
    # vector is used to address rows in random order.
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=SEED)

        # Group labels (re-ordered by idx_vec) for group-wise splitting;
        # encode string groups as integers.
        cv_grp = meta[split_on].values[idx_vec] if split_on is not None else None
        if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds = {}
        vl_folds = {}
        te_folds = {}

        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold}')
            tr_id = idx_vec[tr_id] # adjust the indices!
            vl_id = idx_vec[vl_id] # adjust the indices!

            # -----------------
            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Update te_size to the new full size of available samples, so the
            # test split carved out of tr below has (roughly) the same size as vl.
            if cv_folds == 1:
                te_size_ = vl_size / (1 - vl_size)
            else:
                te_size_ = len(vl_id)/len(idx_vec) / (1 - len(vl_id)/len(idx_vec))

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size_,
                                      mltype=mltype, shuffle=False, random_state=SEED)

            # Update the index array
            idx_vec_ = tr_id; del tr_id

            te_grp = meta[split_on].values[idx_vec_] if split_on is not None else None
            if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id] # adjust the indices!
            te_id = idx_vec_[te_id] # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()
            # -----------------

            lg.logger.info('Train samples {} ({:.2f}%)'.format( len(tr_id), 100*len(tr_id)/xdata.shape[0] ))
            lg.logger.info('Val samples {} ({:.2f}%)'.format( len(vl_id), 100*len(vl_id)/xdata.shape[0] ))
            lg.logger.info('Test samples {} ({:.2f}%)'.format( len(te_id), 100*len(te_id)/xdata.shape[0] ))

            # Confirm that group splits are correct (no group leaks across sets)
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convet to df
        # from_dict takes too long --> faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
        vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
        te_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in te_folds.items() ]))

        # Dump
        tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
        vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
        te_folds.to_csv( outdir/f'{cv_folds}fold_te_id.csv', index=False )

        # Plot target dist only for the 1-fold case
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds==1 and fold==0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')

            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')

            # pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
            # pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')
            # pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')

    # # -----------------------------------------------
    # #       Train-test split
    # # -----------------------------------------------
    # np.random.seed(SEED)
    # idx_vec = np.random.permutation(xdata.shape[0])
    #
    # if te_method is not None:
    #     lg.logger.info('\nSplit train/test.')
    #     te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size,
    #                               mltype=mltype, shuffle=False, random_state=SEED)
    #
    #     te_grp = meta[grp_by_col].values[idx_vec] if te_method=='group' else None
    #     if is_string_dtype(te_grp): te_grp = LabelEncoder().fit_transform(te_grp)
    #
    #     # Split train/test
    #     tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    #     tr_id = idx_vec[tr_id] # adjust the indices!
    #     te_id = idx_vec[te_id] # adjust the indices!
    #
    #     pd.Series(tr_id).to_csv( outdir/f'tr_id.csv', index=False, header=[0] )
    #     pd.Series(te_id).to_csv( outdir/f'te_id.csv', index=False, header=[0] )
    #
    #     lg.logger.info('Train: {:.1f}'.format( len(tr_id)/xdata.shape[0] ))
    #     lg.logger.info('Test: {:.1f}'.format( len(te_id)/xdata.shape[0] ))
    #
    #     # Update the master idx vector for the CV splits
    #     idx_vec = tr_id
    #
    #     # Plot dist of responses (TODO: this can be done to all response metrics)
    #     # plot_ytr_yvl_dist(ytr=tr_ydata.values, yvl=te_ydata.values,
    #     #                   title='tr and te', outpath=run_outdir/'tr_te_resp_dist.png')
    #
    #     # Confirm that group splits are correct
    #     if te_method=='group' and grp_by_col is not None:
    #         tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
    #         te_grp_unq = set(meta.loc[te_id, grp_by_col])
    #         lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
    #         lg.logger.info(f'\tA few intersections : {list(tr_grp_unq.intersection(te_grp_unq))[:3]}.')
    #
    #     # Update vl_size to effective vl_size
    #     vl_size = vl_size * xdata.shape[0]/len(tr_id)
    #
    #     # Plot hist te
    #     pd.Series(meta.loc[te_id, 'AUC'].values, name='yte').to_csv(outdir/'yte.csv')
    #     plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')
    #
    #     del tr_id, te_id
    #
    #
    # # -----------------------------------------------
    # #       Generate CV splits
    # # -----------------------------------------------
    # cv_folds_list = [1, 5, 7, 10, 15, 20, 25]
    # lg.logger.info(f'\nStart CV splits ...')
    #
    # for cv_folds in cv_folds_list:
    #     lg.logger.info(f'\nCV folds: {cv_folds}')
    #
    #     cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
    #                      mltype=mltype, shuffle=False, random_state=SEED)
    #
    #     cv_grp = meta[grp_by_col].values[idx_vec] if cv_method=='group' else None
    #     if is_string_dtype(cv_grp): cv_grp = LabelEncoder().fit_transform(cv_grp)
    #
    #     tr_folds = {}
    #     vl_folds = {}
    #
    #     # Start CV iters
    #     for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
    #         tr_id = idx_vec[tr_id] # adjust the indices!
    #         vl_id = idx_vec[vl_id] # adjust the indices!
    #
    #         tr_folds[fold] = tr_id.tolist()
    #         vl_folds[fold] = vl_id.tolist()
    #
    #         # Confirm that group splits are correct
    #         if cv_method=='group' and grp_by_col is not None:
    #             tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
    #             vl_grp_unq = set(meta.loc[vl_id, grp_by_col])
    #             lg.logger.info(f'\tTotal group ({grp_by_col}) intersections btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
    #             lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
    #             lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
    #
    #     # Convet to df
    #     # from_dict takes too long --> faster described here: stackoverflow.com/questions/19736080/
    #     # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
    #     # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
    #     tr_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in tr_folds.items() ]))
    #     vl_folds = pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in vl_folds.items() ]))
    #
    #     # Dump
    #     tr_folds.to_csv( outdir/f'{cv_folds}fold_tr_id.csv', index=False )
    #     vl_folds.to_csv( outdir/f'{cv_folds}fold_vl_id.csv', index=False )
    #
    #     # Plot target dist only for the 1-fold case
    #     if cv_folds==1 and fold==0:
    #         plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
    #         plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
    #
    #         plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
    #                           title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')
    #
    #         pd.Series(meta.loc[tr_id, 'AUC'].values, name='ytr').to_csv(outdir/'ytr.csv')
    #         pd.Series(meta.loc[vl_id, 'AUC'].values, name='yvl').to_csv(outdir/'yvl.csv')

    lg.kill_logger()
    print('Done.')
    def trn_learning_curve(
            self,
            framework: str = 'lightgbm',
            mltype: str = 'reg',
            model_name: str = 'lgb_reg',  # TODO! this is redundent
            # NOTE(review): mutable default arguments ({} and []) are shared
            # across calls; safe here only because they are never mutated —
            # consider None-sentinels.
            init_kwargs: dict = {},
            fit_kwargs: dict = {},
            clr_keras_kwargs: dict = {},
            metrics: list = [
                'r2', 'neg_mean_absolute_error', 'neg_median_absolute_error',
                'neg_mean_squared_error'
            ],
            n_jobs: int = 4,
            random_state: int = None,
            plot=True):
        """Train models across CV folds and training-set shard sizes to build
        a learning curve.

        Args:
            framework : ml framework (keras, lightgbm, or sklearn)
            mltype : type to ml problem (reg or cls)
            init_kwargs : dict of parameters that initialize the estimator
            fit_kwargs : dict of parameters to the estimator's fit() method
            clr_keras_kwargs :
            metrics : allow to pass a string of metrics  TODO!

        Returns:
            pd.DataFrame of per-fold, per-shard scores for the tr/vl/te sets
            (also dumped to CSV under self.outdir).
        """
        self.framework = framework
        self.mltype = mltype
        self.model_name = model_name
        self.init_kwargs = init_kwargs
        self.fit_kwargs = fit_kwargs
        self.clr_keras_kwargs = clr_keras_kwargs
        self.metrics = metrics
        self.n_jobs = n_jobs
        self.random_state = random_state

        # Start nested loop of train size and cv folds
        tr_scores_all = []  # list of dicts
        vl_scores_all = []  # list of dicts
        te_scores_all = []  # list of dicts

        # Record runtime per shard
        runtime_records = []

        # CV loop: tr/vl/te index dicts are iterated in parallel (one key per fold)
        for fold, (tr_k, vl_k, te_k) in enumerate(
                zip(self.tr_dct.keys(), self.vl_dct.keys(), self.te_dct.keys())):
            fold = fold + 1  # 1-based fold numbering for logs/filenames
            if self.logger is not None:
                self.logger.info(f'Fold {fold}/{self.cv_folds}')

            # Get the indices for this fold
            tr_id = self.tr_dct[tr_k]
            vl_id = self.vl_dct[vl_k]
            te_id = self.te_dct[te_k]

            # Samples from this dataset are randomly sampled for training
            xtr = self.X[tr_id, :]
            # ytr = self.Y[tr_id, :]
            ytr = np.squeeze(self.Y[tr_id, :])

            # A fixed set of val samples for the current CV split
            xvl = self.X[vl_id, :]
            yvl = np.squeeze(self.Y[vl_id, :])

            # A fixed set of test samples for the current CV split
            xte = self.X[te_id, :]
            yte = np.squeeze(self.Y[te_id, :])

            # Shards loop (iterate across the dataset sizes and train)
            """
            np.random.seed(random_state)
            idx = np.random.permutation(len(xtr))
            Note that we don't shuffle the dataset another time using the
            commands above.
            """
            idx = np.arange(len(xtr))
            for i, tr_sz in enumerate(self.tr_shards):
                # For each shard: train model, save best model, calc tr_scores, calc_vl_scores
                if self.logger:
                    self.logger.info(
                        f'\tTrain size: {tr_sz} ({i+1}/{len(self.tr_shards)})')

                # Sequentially get a subset of samples (the input dataset X must be shuffled)
                xtr_sub = xtr[idx[:tr_sz], :]
                # ytr_sub = np.squeeze(ytr[idx[:tr_sz], :])
                ytr_sub = ytr[idx[:tr_sz]]

                # Get the estimator
                estimator = ml_models.get_model(self.model_name, init_kwargs=self.init_kwargs)
                model = estimator.model

                # Train
                # self.val_split = 0 # 0.1 # used for early stopping
                #self.eval_frac = 0.1 # 0.1 # used for early stopping
                #eval_samples = int(self.eval_frac * xvl.shape[0])
                #eval_set = (xvl[:eval_samples, :], yvl[:eval_samples]) # we don't random sample; the same eval_set is used for early stopping
                eval_set = (xvl, yvl)
                if self.framework == 'lightgbm':
                    model, trn_outdir, runtime = self.trn_lgbm_model(
                        model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub,
                        fold=fold, tr_sz=tr_sz, eval_set=eval_set)
                elif self.framework == 'sklearn':
                    model, trn_outdir, runtime = self.trn_sklearn_model(
                        model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub,
                        fold=fold, tr_sz=tr_sz, eval_set=None)
                elif self.framework == 'keras':
                    model, trn_outdir, runtime = self.trn_keras_model(
                        model=model, xtr_sub=xtr_sub, ytr_sub=ytr_sub,
                        fold=fold, tr_sz=tr_sz, eval_set=eval_set)
                elif self.framework == 'pytorch':
                    # NOTE(review): pytorch branch is a stub — falls through
                    # with 'model'/'trn_outdir'/'runtime' unset from this shard.
                    pass
                else:
                    raise ValueError(
                        f'Framework {self.framework} is not supported.')

                # Save plot of target distribution
                plot_hist(ytr_sub, var_name=f'Target (Train size={tr_sz})',
                          fit=None, bins=100, path=trn_outdir / 'target_hist_tr.png')
                plot_hist(yvl, var_name=f'Target (Val size={len(yvl)})',
                          fit=None, bins=100, path=trn_outdir / 'target_hist_vl.png')
                plot_hist(yte, var_name=f'Target (Test size={len(yte)})',
                          fit=None, bins=100, path=trn_outdir / 'target_hist_te.png')

                # Calc preds and scores TODO: dump preds
                # ... training set
                y_pred, y_true = calc_preds(model, x=xtr_sub, y=ytr_sub, mltype=self.mltype)
                tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None)
                tr_scores['y_avg'] = np.mean(y_pred)
                # ... val set
                y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=self.mltype)
                vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None)
                vl_scores['y_avg'] = np.mean(y_pred)
                # ... test set
                y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=self.mltype)
                te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=self.mltype, metrics=None)
                te_scores['y_avg'] = np.mean(y_pred)

                del estimator, model

                # Save predictions (need to include metadata)
                # TODO
                pass

                # Store runtime
                runtime_records.append((fold, tr_sz, runtime))

                # Add metadata
                # tr_scores['tr_set'] = True
                tr_scores['set'] = 'tr'
                tr_scores['fold'] = 'fold' + str(fold)
                tr_scores['tr_size'] = tr_sz

                # vl_scores['tr_set'] = False
                vl_scores['set'] = 'vl'
                vl_scores['fold'] = 'fold' + str(fold)
                vl_scores['tr_size'] = tr_sz

                # te_scores['tr_set'] = False
                te_scores['set'] = 'te'
                te_scores['fold'] = 'fold' + str(fold)
                te_scores['tr_size'] = tr_sz

                # Append scores (dicts)
                tr_scores_all.append(tr_scores)
                vl_scores_all.append(vl_scores)
                te_scores_all.append(te_scores)

                # Dump intermediate scores
                # TODO: test this!
                scores_tmp = pd.concat([
                    scores_to_df([tr_scores]),
                    scores_to_df([vl_scores]),
                    scores_to_df([te_scores])
                ], axis=0)
                scores_tmp.to_csv(trn_outdir / ('scores_tmp.csv'), index=False)
                del trn_outdir, scores_tmp

            # Dump intermediate results (this is useful if the run terminates before run ends)
            scores_all_df_tmp = pd.concat([
                scores_to_df(tr_scores_all),
                scores_to_df(vl_scores_all),
                scores_to_df(te_scores_all)
            ], axis=0)
            scores_all_df_tmp.to_csv(
                self.outdir / ('_lrn_crv_scores_cv' + str(fold) + '.csv'),
                index=False)

        # Scores to df
        tr_scores_df = scores_to_df(tr_scores_all)
        vl_scores_df = scores_to_df(vl_scores_all)
        te_scores_df = scores_to_df(te_scores_all)
        scores_df = pd.concat([tr_scores_df, vl_scores_df, te_scores_df], axis=0)

        # Dump final results
        tr_scores_df.to_csv(self.outdir / 'tr_lrn_crv_scores.csv', index=False)
        vl_scores_df.to_csv(self.outdir / 'vl_lrn_crv_scores.csv', index=False)
        te_scores_df.to_csv(self.outdir / 'te_lrn_crv_scores.csv', index=False)
        scores_df.to_csv(self.outdir / 'lrn_crv_scores.csv', index=False)

        # Runtime df
        runtime_df = pd.DataFrame.from_records(
            runtime_records, columns=['fold', 'tr_sz', 'time'])
        runtime_df.to_csv(self.outdir / 'runtime.csv', index=False)

        # Plot learning curves
        if plot:
            plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir)
            plot_lrn_crv_all_metrics(scores_df, outdir=self.outdir,
                                     xtick_scale='log2', ytick_scale='log2')
            plot_runtime(runtime_df, outdir=self.outdir,
                         xtick_scale='log2', ytick_scale='log2')

        return scores_df
def main(cfg):
    """Train an MNIST MLP, or evaluate weight/unit pruning of a trained model.

    Mode is selected by cfg.train: True runs the training loop (tracking and
    optionally plotting/saving the best-val-loss model); False loads the saved
    model and evaluates it at each sparsity level in cfg.sparsity, timing the
    pruned forward pass. All stdout is redirected to a per-run text file.

    Args:
        cfg: parsed configuration namespace (argparse-style) with the fields
            referenced below.
    """
    # setting up output directories, and writing to stdout
    make_dirs(cfg.stdout_dir, replace=False)
    if cfg.train:
        run_type = 'train'
    else:
        if 'weight' in cfg.prune_type.lower():
            run_type = 'weight-prune'
        else:
            run_type = 'unit-prune'
    # NOTE(review): sys.stdout is replaced and the file handle is never
    # explicitly closed — relies on interpreter exit to flush/close.
    sys.stdout = open(
        '{}/stdout_{}_{}.txt'.format(cfg.stdout_dir, cfg.model_name, run_type),
        'w')
    print(cfg)
    print('\n')
    sys.stdout.flush()

    # if train mode, replace the previous plot and ckpt directories; if in prune mode, use existing directories
    if cfg.plot:
        make_dirs(os.path.join(cfg.plot_dir, cfg.model_name), replace=cfg.train)
    if cfg.save_model:
        make_dirs(os.path.join(cfg.model_dir, cfg.model_name), replace=cfg.train)

    # set random seed (0 means: draw a fresh random seed)
    if cfg.random_seed != 0:
        random_seed = cfg.random_seed
    else:
        random_seed = random.randint(1, 100000)
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    # set device as cuda or cpu
    if cfg.use_gpu and torch.cuda.is_available():
        # reproducibility using cuda
        torch.cuda.manual_seed(random_seed)
        cudnn.deterministic = True
        cudnn.benchmark = False
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        if cfg.use_gpu:
            print('gpu option was to <True>, but no cuda device was found')
            print('\n')

    # datasets and dataloaders
    # normalizing training and validation images to [0, 1] suffices for the purposes of our research objective
    # in training, <drop_last> minibatch in an epoch set to <True> for simplicity in tracking training performance
    dataset_train = MNIST(root='./data/mnist', train=True, download=True,
                          transform=transforms.Compose([transforms.ToTensor()]),
                          target_transform=None)
    dataloader_train = DataLoader(dataset=dataset_train,
                                  batch_size=cfg.batch_size,
                                  shuffle=cfg.shuffle,
                                  num_workers=cfg.num_workers,
                                  pin_memory=True,
                                  drop_last=True)
    dataset_val = MNIST(root='./data/mnist', train=False, download=True,
                        transform=transforms.Compose([transforms.ToTensor()]),
                        target_transform=None)
    dataloader_val = DataLoader(dataset=dataset_val,
                                batch_size=100,
                                shuffle=False,
                                num_workers=cfg.num_workers,
                                pin_memory=True,
                                drop_last=False)

    # automatically compute number of classes
    targets = np.asarray(dataset_train.targets)
    c = np.unique(targets).shape[0]

    # define model
    # weights initialized using Kaiming uniform (He initialization)
    # number of units per hidden layer is passed in as an argument
    net = Net(np.product(cfg.img_size), c, cfg.units).to(device)
    criterion = nn.CrossEntropyLoss()

    if cfg.train:
        # training mode
        if cfg.use_sgd:
            optimizer = optim.SGD(params=net.parameters(), lr=cfg.lr,
                                  momentum=cfg.momentum,
                                  nesterov=cfg.use_nesterov)
        else:
            optimizer = optim.Adam(params=net.parameters(), lr=cfg.lr,
                                   betas=(cfg.beta1, cfg.beta2))

        # tracking training and validation stats over epochs
        epochs = []
        train_loss_epochs, val_loss_epochs = [], []
        train_acc_epochs, val_acc_epochs = [], []

        # best model is defined as model with best performing validation loss
        best_loss = float('inf')
        for epoch in range(cfg.epochs):
            # tracking training and validation stats over a given epoch
            train_loss_epoch, val_loss_epoch = [], []
            train_acc_epoch, val_acc_epoch = [], []

            # training set
            for i, (x, y) in enumerate(dataloader_train):
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                logits = net(x)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
                acc = calculate_acc(logits, y)
                # 'append' is a project helper: appends value to list for each
                # (list, value) pair.
                append((train_loss_epoch, loss.item()),
                       (train_acc_epoch, acc.item()))

            # validation set
            with torch.no_grad():
                for i, (x, y) in enumerate(dataloader_val):
                    x, y = x.to(device), y.to(device)
                    logits = net(x)
                    loss = criterion(logits, y)
                    acc = calculate_acc(logits, y)
                    append((val_loss_epoch, loss.item()),
                           (val_acc_epoch, acc.item()))

            # reduce per-minibatch stats to epoch means
            train_loss_epoch, val_loss_epoch = get_average(
                train_loss_epoch), get_average(val_loss_epoch)
            train_acc_epoch, val_acc_epoch = get_average(
                train_acc_epoch), get_average(val_acc_epoch)

            print('train_epoch{:0=3d}_loss{:.4f}_acc{:.4f}'.format(
                epoch + 1, train_loss_epoch, train_acc_epoch))
            print('valid_epoch{:0=3d}_loss{:.4f}_acc{:.4f}'.format(
                epoch + 1, val_loss_epoch, val_acc_epoch))
            print('\n')
            sys.stdout.flush()

            if cfg.plot:
                append((epochs, epoch + 1),
                       (train_loss_epochs, train_loss_epoch),
                       (val_loss_epochs, val_loss_epoch),
                       (train_acc_epochs, train_acc_epoch),
                       (val_acc_epochs, val_acc_epoch))
                plot_line(epochs, train_loss_epochs, val_loss_epochs,
                          'Epoch Number', 'Loss', cfg)
                plot_line(epochs, train_acc_epochs, val_acc_epochs,
                          'Epoch Number', 'Accuracy', cfg)

            if val_loss_epoch < best_loss:
                best_loss = val_loss_epoch
                print('New best model at epoch {:0=3d} with val_loss {:.4f}'.
                      format(epoch + 1, best_loss))
                print('\n')
                if cfg.save_model:
                    # save model when validation loss improves; the .txt file
                    # records the latest best checkpoint name for prune mode
                    save_name = '{}_net_epoch{:0=3d}_val_loss{:.4f}'.format(
                        cfg.model_name, epoch + 1, best_loss)
                    torch.save(
                        net.state_dict(),
                        os.path.join(cfg.model_dir, cfg.model_name,
                                     '{}.pth'.format(save_name)))
                    with open(
                            os.path.join(cfg.model_dir, cfg.model_name,
                                         '{}.txt'.format(cfg.model_name)),
                            'w') as file:
                        file.write('{}.pth'.format(save_name))

    else:
        # pruning mode
        # checks on arguments passed in
        for k in cfg.sparsity:
            assert 0 <= k <= 1
        if cfg.use_sparse_mul:
            assert cfg.to_sparse

        # load model (checkpoint name recorded by training mode)
        with open(
                os.path.join(cfg.model_dir, cfg.model_name,
                             '{}.txt'.format(cfg.model_name)), 'r') as file:
            load_name = file.readline()
        net.load_state_dict(
            torch.load(
                os.path.join(cfg.model_dir, cfg.model_name,
                             '{}'.format(load_name))))
        net.eval()

        # select pruning approach to use
        if 'weight' in cfg.prune_type.lower():
            prune = weight_prune
        else:
            prune = unit_prune

        sparsities = []
        val_loss_sparse, val_acc_sparse = [], []
        time_sparsities = []
        for k in cfg.sparsity:
            val_loss_k, val_acc_k = [], []
            time_k = []

            # copy network so that the sparsity changes are not additive for each k
            net_sparse = copy.deepcopy(net)

            pruned_weights = []
            # prune model, except for the last layer
            for (i, p) in enumerate(net_sparse.parameters()):
                if i < len(cfg.units):
                    original_weights = copy.deepcopy(p.data)
                    if cfg.plot:
                        # plot magnitude of original weights (for comparison to post-pruned weights)
                        plot_hist([
                            torch.abs(original_weights.flatten()).cpu().numpy()
                        ], ['b'], cfg.prune_type, i + 1, k,
                                  'Non-Pruned Weight Magnitudes', 'Counts',
                                  cfg)
                    prune(p.data, k)  # in-place pruning of this layer's weights
                    if cfg.plot:
                        # plot original magnitudes of pruned weights, and magnitudes of remaining weights, separately
                        pruned_weights_non_zero = torch.abs(
                            original_weights.flatten()[p.data.flatten() != 0])
                        pruned_weights_zeroed = torch.abs(
                            original_weights.flatten()[p.data.flatten() == 0])
                        plot_hist([
                            pruned_weights_non_zero.cpu().numpy(),
                            pruned_weights_zeroed.cpu().numpy()
                        ], ['g', 'r'], cfg.prune_type, i + 1, k,
                                  'Weight Magnitudes', 'Counts', cfg)
                        plot_hist([pruned_weights_non_zero.cpu().numpy()],
                                  ['k'], cfg.prune_type, i + 1, k,
                                  'Surviving Weight Magnitudes', 'Counts', cfg)
                # collect every parameter tensor; optionally as sparse tensors
                # (only the pruned, non-final layers are converted)
                if cfg.to_sparse and i < len(cfg.units):
                    pruned_weights.append(p.data.to_sparse())
                else:
                    pruned_weights.append(p.data)

            # evaluate the pruned weights with a functional forward pass,
            # timing each minibatch
            with torch.no_grad():
                for i, (x, y) in enumerate(dataloader_val):
                    x, y = x.to(device), y.to(device)
                    start = time.time()
                    logits = forward(x, pruned_weights, cfg.use_sparse_mul)
                    end = time.time()
                    loss = criterion(logits, y)
                    acc = calculate_acc(logits, y)
                    append((val_loss_k, loss.item()),
                           (val_acc_k, acc.item()), (time_k, end - start))

            val_loss_k, val_acc_k, time_k = get_average(
                val_loss_k), get_average(val_acc_k), get_average(time_k)

            print('valid_{}_k{:.2f}_loss{:.4f}_acc{:.4f}'.format(
                run_type, k, val_loss_k, val_acc_k))
            print('valid_{}_k{:.2f}_time/minibatch{:.6f}'.format(
                run_type, k, time_k))
            print('\n')
            sys.stdout.flush()

            if cfg.plot:
                append((sparsities, k), (val_loss_sparse, val_loss_k),
                       (val_acc_sparse, val_acc_k), (time_sparsities, time_k))
                plot_line(sparsities, [], val_loss_sparse,
                          'Sparsity {} Prune'.format(cfg.prune_type), 'Loss',
                          cfg)
                plot_line(sparsities, [], val_acc_sparse,
                          'Sparsity {} Prune'.format(cfg.prune_type),
                          'Accuracy', cfg)
                plot_line(sparsities, [], time_sparsities,
                          'Sparsity {} Prune'.format(cfg.prune_type), 'Time',
                          cfg)

            if cfg.save_model:
                torch.save(
                    net_sparse.state_dict(),
                    os.path.join(
                        cfg.model_dir, cfg.model_name,
                        '{}_sparse_net_{}_val_loss{:.4f}.pth'.format(
                            cfg.model_name, run_type, val_loss_k)))
i, np.mean(test_data[3][i]), np.mean(test_data[2][i]), *agent.sess.run([agent.learning_rate, agent.global_step]))) # Batch train agent agent_data = [ train_data[0], train_data[1], train_data[2][i], train_data[3][i] ] agent.batch_train(agent_data) agent.epsilon = agent.epsilon * 0.975 agent.save(name='duopolist_{}'.format(i), step=step) for i in range(len(agents_list)): plotting.plot_hist( actions[i], xticks=np.arange(len(env.action_space)), show=False, save_path='../plots/duopolist_{}_actions.png'.format(i)) lines_to_plot = OrderedDict([ ('Competitive outcome', duopoly_outcome_comp[i]['profit']), ('Collusive outcome', duopoly_outcome_coll[i]['profit']), ('Deviation outcome', duopoly_outcome_deviate[i]['profit']) ]) plotting.plot_rewards( avg_rewards[i], lines=lines_to_plot, show=False, save_path='../plots/duopolist_{}_rewards.png'.format(i))
def run(args):
    """Create hold-out (train/val/test) and multi-k CV splits of a master dataset.

    Loads the single master ``.parquet`` dataset found in ``args['dirpath']``,
    separates feature columns from meta columns, and generates:
      1) one hold-out train/val/test split, and
      2) k-fold CV splits for several values of k, where each fold's train
         part is further split into a smaller train subset and a test subset.
    Split index files, parquet dumps of features/meta, target histograms, and
    a log file are all written into a newly created output directory.

    Args:
        args (dict): parsed script arguments. Keys read here: 'dirpath',
            'te_size', 'cell_fea', 'drug_fea', 'split_on', 'seed'.
            Mutated: args['outdir'] is set to the created output dir.

    Raises:
        FileNotFoundError: if no .parquet file exists in the data directory.
    """
    dirpath = verify_dirpath(args['dirpath'])
    te_size = split_size(args['te_size'])
    fea_list = args['cell_fea'] + args['drug_fea']

    # Hard split: when 'split_on' is given, use group-based splitting so all
    # samples of the same group (e.g. cell line) land in the same subset.
    split_on = None if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method  # TODO: this needs to be improved

    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #       Create (outdir and) logger
    # -----------------------------------------------
    outdir = create_outdir(dirpath, args)
    args['outdir'] = str(outdir)
    lg = Logger(outdir / 'data_splitter_logfile.log')
    # NOTE(review): `filepath` is not defined in this function; presumably a
    # module-level global pointing at this script -- confirm.
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(args, outpath=outdir / 'data_splitter_args.txt')  # dump args

    # -----------------------------------------------
    #       Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    files = list(dirpath.glob('./*.parquet'))
    # Fail loudly when the master dataset is missing (previously `data` was
    # assigned only inside `if len(files) > 0`, so a missing file surfaced
    # as an opaque NameError on the next statement).
    if len(files) == 0:
        raise FileNotFoundError(f'No .parquet file found in {dirpath}')
    data = pd.read_parquet(files[0])  # TODO: assumes there is only one data file
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and target, and dump to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet(outdir / 'xdata.parquet')
    meta.to_parquet(outdir / 'meta.parquet')

    lg.logger.info('Total DD: {}'.format(
        len([c for c in xdata.columns if 'DD_' in c])))
    lg.logger.info('Total GE: {}'.format(
        len([c for c in xdata.columns if 'GE_' in c])))
    lg.logger.info('Unique cells: {}'.format(meta['CELL'].nunique()))
    lg.logger.info('Unique drugs: {}'.format(meta['DRUG'].nunique()))

    # Target distribution over the full dataset
    plot_hist(meta['AUC'], var_name='AUC', fit=None, bins=100,
              path=outdir / 'AUC_hist_all.png')

    # -----------------------------------------------
    #       Generate Hold-Out split (train/val/test)
    # -----------------------------------------------
    # First, split the data into train and test. The remainder of the train
    # set is further split into train and validation.
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info('Split into hold-out train/val/test')
    lg.logger.info('{}'.format('-' * 50))

    # We don't shuffle the original dataset; instead we permute a vector of
    # representative indices and split that.
    np.random.seed(args['seed'])
    idx_vec = np.random.permutation(data.shape[0])

    # Create splitter that splits the full dataset into tr and te
    te_folds = int(1 / te_size)
    te_splitter = cv_splitter(cv_method=te_method,
                              cv_folds=te_folds,
                              test_size=None,
                              mltype=mltype,
                              shuffle=False,
                              random_state=args['seed'])

    te_grp = None if split_on is None else meta[split_on].values[idx_vec]
    if is_string_dtype(te_grp):
        te_grp = LabelEncoder().fit_transform(te_grp)

    # Split into tr and te
    tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    tr_id = idx_vec[tr_id]  # adjust the indices! we'll split the remaining tr into tr and vl
    te_id = idx_vec[te_id]  # adjust the indices!

    # Index vector that excludes the test indices
    idx_vec_ = tr_id
    del tr_id

    # Define vl_size w.r.t. the remaining (non-test) samples so that the
    # validation set has the same absolute size as the test set.
    vl_size = te_size / (1 - te_size)
    cv_folds = int(1 / vl_size)

    # Create splitter that splits tr into tr and vl
    cv = cv_splitter(cv_method=cv_method,
                     cv_folds=cv_folds,
                     test_size=None,
                     mltype=mltype,
                     shuffle=False,
                     random_state=args['seed'])

    cv_grp = None if split_on is None else meta[split_on].values[idx_vec_]
    if is_string_dtype(cv_grp):
        cv_grp = LabelEncoder().fit_transform(cv_grp)

    # Split tr into tr and vl
    tr_id, vl_id = next(cv.split(idx_vec_, groups=cv_grp))
    tr_id = idx_vec_[tr_id]  # adjust the indices!
    vl_id = idx_vec_[vl_id]  # adjust the indices!

    # Dump tr, vl, te indices
    np.savetxt(outdir / '1fold_tr_id.csv', tr_id.reshape(-1, 1),
               fmt='%d', delimiter='', newline='\n')
    np.savetxt(outdir / '1fold_vl_id.csv', vl_id.reshape(-1, 1),
               fmt='%d', delimiter='', newline='\n')
    np.savetxt(outdir / '1fold_te_id.csv', te_id.reshape(-1, 1),
               fmt='%d', delimiter='', newline='\n')

    lg.logger.info('Train samples {} ({:.2f}%)'.format(
        len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
    lg.logger.info('Val samples {} ({:.2f}%)'.format(
        len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
    lg.logger.info('Test samples {} ({:.2f}%)'.format(
        len(te_id), 100 * len(te_id) / xdata.shape[0]))

    # Confirm that group splits are correct (no intersection)
    grp_col = 'CELL' if split_on is None else split_on
    print_intersection_on_var(meta, tr_id=tr_id, vl_id=vl_id, te_id=te_id,
                              grp_col=grp_col, logger=lg.logger)

    # Target distribution per hold-out subset
    plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100,
              path=outdir / 'AUC_hist_train.png')
    plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100,
              path=outdir / 'AUC_hist_val.png')
    plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100,
              path=outdir / 'AUC_hist_test.png')

    plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                      title='ytr_yvl_dist', outpath=outdir / 'ytr_yvl_dist.png')

    # -----------------------------------------------
    #       Generate CV splits
    # -----------------------------------------------
    # K-fold CV is applied with multiple values of k. For each k the dataset
    # is divided into k folds of (train, val); each fold's train part is then
    # further split into a smaller train subset and a test subset.
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info(f"Split into multiple sets k-fold splits (multiple k's)")
    lg.logger.info('{}'.format('-' * 50))

    cv_folds_list = [5, 7, 10, 15, 20]
    for cv_folds in cv_folds_list:
        lg.logger.info(f'\n----- {cv_folds}-fold splits -----')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method,
                         cv_folds=cv_folds,
                         test_size=None,
                         mltype=mltype,
                         shuffle=False,
                         random_state=args['seed'])

        cv_grp = None if split_on is None else meta[split_on].values[idx_vec]
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds, vl_folds, te_folds = {}, {}, {}

        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold+1}')
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Re-derive the test fraction w.r.t. the remaining train samples:
            # p / (1 - p) where p is this fold's share of the full index set.
            te_size_ = len(vl_id) / len(idx_vec) / (1 - len(vl_id) / len(idx_vec))
            te_folds_split = int(1 / te_size_)

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method,
                                      cv_folds=te_folds_split,
                                      test_size=None,
                                      mltype=mltype,
                                      shuffle=False,
                                      random_state=args['seed'])

            # Update the index array
            idx_vec_ = tr_id
            del tr_id

            te_grp = None if split_on is None else meta[split_on].values[idx_vec_]
            if is_string_dtype(te_grp):
                te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id]  # adjust the indices!
            te_id = idx_vec_[te_id]  # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()

            lg.logger.info('Train samples {} ({:.2f}%)'.format(
                len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
            lg.logger.info('Val samples {} ({:.2f}%)'.format(
                len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
            lg.logger.info('Test samples {} ({:.2f}%)'.format(
                len(te_id), 100 * len(te_id) / xdata.shape[0]))

            # Confirm that group splits are correct (no intersection)
            grp_col = 'CELL' if split_on is None else split_on
            print_intersection_on_var(meta, tr_id=tr_id, vl_id=vl_id,
                                      te_id=te_id, grp_col=grp_col,
                                      logger=lg.logger)

        # Convert to df. from_dict takes too long --> faster approach
        # described here: stackoverflow.com/questions/19736080/
        tr_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in tr_folds.items()]))
        vl_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in vl_folds.items()]))
        te_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in te_folds.items()]))

        # Dump
        tr_folds.to_csv(outdir / f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir / f'{cv_folds}fold_vl_id.csv', index=False)
        te_folds.to_csv(outdir / f'{cv_folds}fold_te_id.csv', index=False)

    lg.kill_logger()
    print('Done.')
def main(path, decil, color):
    """Read UF data from *path* and plot one histogram per entry of *color*.

    Delegates loading to ``unpacking_reading.read_ufs`` and plotting to
    ``plotting.plot_hist``; one plot is produced per index of ``data[color]``.
    """
    data = unpacking_reading.read_ufs(path)
    for idx, _ in enumerate(data[color]):
        plotting.plot_hist(data, decil, color, idx)