def data_splitter(n_splits=1, gout=None, outfigs=None, ydata=None,
                  print_fn=print, **kwargs):
    """Generate multiple train/val/test splits.

    Calls gen_single_split() a total of n_splits times to generate
    multiple train/val/test splits.

    Args:
        n_splits : number of splits to generate
        gout : global outdir to dump the split id files (or None to skip)
        outfigs : outdir to dump histograms of the target variable (or None)
        ydata : the target variable (array-like); used for the histograms
            and forwarded to gen_single_split
        print_fn : print function
        **kwargs : forwarded to gen_single_split (e.g. split_on, the column
            name in the dataframe to use for a hard (group) partition)

    Returns:
        (tr_dct, vl_dct, te_dct) : tuple of dicts mapping split index to
        train/val/test id arrays
    """
    # A random permutation of range(n_splits), used as per-split seeds.
    # NOTE(review): the global RNG is not seeded here, so the generated
    # splits are not reproducible across runs — confirm whether the caller
    # sets np.random.seed.
    seeds = np.random.choice(n_splits, n_splits, replace=False)

    # These dicts will contain the splits.
    tr_dct = {}
    vl_dct = {}
    te_dct = {}

    for i, seed in enumerate(seeds):
        tr_id, vl_id, te_id = gen_single_split(ydata=ydata, seed=seed, **kwargs)
        tr_dct[i] = tr_id
        vl_dct[i] = vl_id
        te_dct[i] = te_id

        # digits = len(str(n_splits))
        seed_str = str(i)  # f"{seed}".zfill(digits)
        output = '1fold_s' + seed_str

        if gout is not None:
            # Dump one id per line as plain integers.
            np.savetxt(gout/f'{output}_tr_id.txt', tr_id.reshape(-1, 1),
                       fmt='%d', delimiter='', newline='\n')
            np.savetxt(gout/f'{output}_vl_id.txt', vl_id.reshape(-1, 1),
                       fmt='%d', delimiter='', newline='\n')
            np.savetxt(gout/f'{output}_te_id.txt', te_id.reshape(-1, 1),
                       fmt='%d', delimiter='', newline='\n')

        if (ydata is not None) and (outfigs is not None):
            plot_hist(ydata[tr_id], title='Train Set Histogram', fit=None,
                      bins=100, path=outfigs/f'{output}_y_hist_train.png')
            plot_hist(ydata[vl_id], title='Val Set Histogram', fit=None,
                      bins=100, path=outfigs/f'{output}_y_hist_val.png')
            plot_hist(ydata[te_id], title='Test Set Histogram', fit=None,
                      bins=100, path=outfigs/f'{output}_y_hist_test.png')

    return (tr_dct, vl_dct, te_dct)
def run(args):
    """Build hold-out train/val/test splits for the app named in args.

    Side effects: creates <appdir>/splits and <appdir>/splits/outfigs,
    writes split id files, target histograms, a log file, and a dump of
    the parsed args.
    """
    te_size = verify_size(args.te_size)

    # Path
    appdir = MAIN_APPDIR / args.appname

    # Hard (group) split column; upper-cased to match dataframe columns.
    split_on = None if args.split_on is None else args.split_on.upper()
    te_method = args.cv_method

    # Specify ML task (regression or classification).
    if args.cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # -----------------------------------------------
    # Create appdir
    # -----------------------------------------------
    gout = appdir / 'splits'
    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    # Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    print_fn = get_print_func(lg.logger)
    # NOTE(review): fdir is not defined in this function — presumably a
    # module-level constant; confirm.
    print_fn(f'File dir: {fdir}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(appdir / 'annotations.csv')
    print_fn('data.shape {}'.format(data.shape))

    # -----------------------------------------------
    # Determine the dataset
    # -----------------------------------------------
    ydata = data[args.trg_name] if args.trg_name in data.columns else None
    if (ydata is None) and (args.cv_method == 'strat'):
        raise ValueError(
            'Y data must be specified if splits need to be stratified.')
    if ydata is not None:
        plot_hist(ydata, title=f'{args.trg_name}', fit=None, bins=100,
                  path=outfigs / f'{args.trg_name}_hist_all.png')

    # -----------------------------------------------
    # Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 50))
    print_fn('Split into hold-out train/val/test')
    print_fn('{}'.format('-' * 50))

    kwargs = {'data': data,
              'cv_method': args.cv_method,
              'te_method': te_method,
              'te_size': te_size,
              'mltype': mltype,
              'split_on': split_on}

    data_splitter(n_splits=args.n_splits, gout=gout, outfigs=outfigs,
                  ydata=ydata, print_fn=print_fn, **kwargs)

    lg.kill_logger()
def data_splitter(
        data,
        n_splits: int = 1,
        gout: Optional[Any] = None,
        outfigs: Optional[Any] = None,
        target_name: Optional[str] = None,
        print_fn=print,
        seed: Optional[int] = None,
        **kwargs) -> Tuple[dict, dict, dict]:
    """Generate multiple train/val/test splits.

    Calls gen_single_split() a total of n_splits times to generate
    multiple train/val/test splits.

    Args:
        data : the full dataframe to split
        n_splits : number of splits to generate
        gout : global outdir to dump the split id files (or None to skip)
        outfigs : outdir to dump histograms of the target variable (or None)
        target_name : name of the target column in data (used for the
            histograms and forwarded to gen_single_split)
        print_fn : print function
        seed : base RNG seed; if given, makes the per-split seeds
            reproducible
        **kwargs : forwarded to gen_single_split (e.g. split_on, the column
            name in the dataframe to use for a hard (group) partition)

    Returns:
        (tr_dct, vl_dct, te_dct) : tuple of dicts mapping split index to
        train/val/test id arrays
    """
    # Seed the global RNG so the per-split seed permutation is reproducible.
    # (The `seed` parameter was previously accepted but never used.)
    if seed is not None:
        np.random.seed(seed)

    # A random permutation of range(n_splits), used as per-split seeds.
    seeds = np.random.choice(n_splits, n_splits, replace=False)

    # These dicts will contain the splits.
    tr_dct = {}
    vl_dct = {}
    te_dct = {}

    for i, seed in enumerate(seeds):
        tr_id, vl_id, te_id = gen_single_split(data, target_name=target_name,
                                               seed=seed, **kwargs)
        tr_dct[i] = tr_id
        vl_dct[i] = vl_id
        te_dct[i] = te_id

        # digits = len(str(n_splits))
        seed_str = str(i)  # f"{seed}".zfill(digits)
        output = '1fold_s' + seed_str

        if gout is not None:
            # Dump one id per line as plain integers.
            np.savetxt(gout / f'{output}_tr_id.txt', tr_id,
                       fmt='%d', delimiter='', newline='\n')
            np.savetxt(gout / f'{output}_vl_id.txt', vl_id,
                       fmt='%d', delimiter='', newline='\n')
            np.savetxt(gout / f'{output}_te_id.txt', te_id,
                       fmt='%d', delimiter='', newline='\n')

        if (target_name in data.columns) and (outfigs is not None):
            ydata = data[target_name]
            plot_hist(ydata[tr_id], title='Train Set Histogram', fit=None,
                      bins=100, path=outfigs / f'{output}_y_hist_train.png')
            plot_hist(ydata[vl_id], title='Val Set Histogram', fit=None,
                      bins=100, path=outfigs / f'{output}_y_hist_val.png')
            plot_hist(ydata[te_id], title='Test Set Histogram', fit=None,
                      bins=100, path=outfigs / f'{output}_y_hist_test.png')

    # Removed leftover debugger breakpoints (import ipdb; ipdb.set_trace())
    # and the debug print of te_dct[0] == te_dct[1], which raised KeyError
    # whenever n_splits < 2 (the default is 1).
    return (tr_dct, vl_dct, te_dct)
def run(args):
    """Build hold-out train/val/test splits for the dataset at args.datapath.

    Side effects: creates the splits outdir (and outfigs/), writes split id
    files, target histograms, a log file, and a dump of the parsed args.
    """
    t0 = time()
    n_splits = int(args.n_splits)
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard (group) split column; upper-cased to match dataframe columns.
    split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification).
    if cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    # Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        gout = gout / datapath.with_suffix('.splits').name
    else:
        gout = datapath.with_suffix('.splits')

    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    # Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    print_fn = get_print_func(lg.logger)
    # NOTE(review): filepath is not defined in this function — presumably a
    # module-level constant; confirm.
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(datapath)
    print_fn('data.shape {}'.format(data.shape))

    ydata = data[trg_name] if trg_name in data.columns else None
    if (ydata is None) and (cv_method == 'strat'):
        raise ValueError(
            'Y data must be available if splits are required to be stratified.')
    if ydata is not None:
        plot_hist(ydata, title=f'{trg_name}', fit=None, bins=100,
                  path=outfigs / f'{trg_name}_hist_all.png')

    # -----------------------------------------------
    # Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 50))
    print_fn('Split into hold-out train/val/test')
    print_fn('{}'.format('-' * 50))

    kwargs = {'data': data,
              'cv_method': cv_method,
              'te_method': te_method,
              'te_size': te_size,
              'mltype': mltype,
              'split_on': split_on}

    data_splitter(n_splits=n_splits, gout=gout, outfigs=outfigs,
                  ydata=ydata, print_fn=print_fn, **kwargs)

    print_fn('Runtime: {:.1f} min'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Build hold-out train/val/test splits for the dataset at args.datapath.

    Side effects: creates the splits outdir (and outfigs/), writes split id
    files, target histograms, a log file, and a dump of the parsed args.
    """
    print("\nInput args:")
    pprint(vars(args))

    t0 = time()
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification).
    if cv_method == "strat":
        mltask = "cls"  # cast mltask to cls in case of stratification
    else:
        mltask = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    # Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        # Note: split_on subfolder is useful for drug response.
        sufx = "none" if args.split_on is None else args.split_on
        # BUG FIX: joining with datapath.with_suffix(".splits") (an absolute
        # path, since datapath is resolve()d) would discard gout entirely;
        # join only the final path component instead.
        gout = gout / datapath.with_suffix(".splits").name
        # sufx already covers the split_on-is-None case, so a single join
        # replaces the previous redundant if/else.
        gout = gout / f"split_on_{sufx}"
    else:
        gout = datapath.with_suffix(".splits")

    outfigs = gout / "outfigs"
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    # Create logger
    # -----------------------------------------------
    lg = Logger(gout / "data.splitter.log")
    print_fn = get_print_func(lg.logger)
    # NOTE(review): fdir is not defined in this function — presumably a
    # module-level constant; confirm.
    print_fn(f"File path: {fdir}")
    print_fn(f"\n{pformat(vars(args))}")
    dump_dict(vars(args), outpath=gout / "data.splitter.args.txt")

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn("\nLoad master dataset.")
    data = load_data(datapath)
    print_fn("data.shape {}".format(data.shape))

    if (cv_method == "strat") and (trg_name not in data.columns):
        raise ValueError(
            "Prediction target column must be available if splits need to be stratified."
        )

    if trg_name in data.columns:
        plot_hist(data[trg_name], title=f"{trg_name}", fit=None, bins=100,
                  path=outfigs / f"{trg_name}_hist_all.png")

    # -----------------------------------------------
    # Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn("\n{}".format("-" * 50))
    print_fn("Split data into hold-out train/val/test")
    print_fn("{}".format("-" * 50))

    kwargs = {"cv_method": cv_method,
              "te_method": te_method,
              "te_size": te_size,
              "mltask": mltask,
              "split_on": args.split_on}

    data_splitter(
        data=data,
        n_splits=args.n_splits,
        gout=gout,
        outfigs=outfigs,
        target_name=trg_name,
        print_fn=print_fn,
        # NOTE(review): `seed` is not defined in this function — presumably
        # a module-level constant, or this should be args.seed; confirm.
        seed=seed,
        **kwargs)

    print_fn("Runtime: {:.1f} min".format((time() - t0) / 60))
    print_fn("Done.")
    lg.close_logger()