コード例 #1
0
def data_splitter( n_splits=1, gout=None, outfigs=None, ydata=None,
                   print_fn=print, **kwargs):
    """
    Build `n_splits` train/val/test partitions, one gen_single_split() call
    per partition, and optionally dump the ids and the target histograms.

    Args:
        n_splits : number of splits to generate
        gout : outdir for the id files; nothing is written when None. It is
            joined with `/`, so a path-like object is expected.
        outfigs : outdir for the target histograms; figures are produced only
            when both `outfigs` and `ydata` are given
        ydata : the target variable; forwarded to gen_single_split() and
            indexed with the returned ids for plotting
        print_fn : print function (accepted but not used in this function)
        **kwargs : forwarded unchanged to gen_single_split()
    Return:
        tr_dct, vl_dct, te_dct : tuple of dicts that map the split number
            (0 .. n_splits-1) to the train, val and test ids of that split
    """
    # Random permutation of 0 .. n_splits-1; each entry seeds one split
    split_seeds = np.random.choice(n_splits, n_splits, replace=False)

    # These dicts will contain the splits
    tr_dct, vl_dct, te_dct = {}, {}, {}

    dump_ids = gout is not None
    dump_figs = (ydata is not None) and (outfigs is not None)

    for split_num, split_seed in enumerate(split_seeds):
        tr_id, vl_id, te_id = gen_single_split(
            ydata=ydata, seed=split_seed, **kwargs)

        tr_dct[split_num] = tr_id
        vl_dct[split_num] = vl_id
        te_dct[split_num] = te_id

        # Files are named after the split number, not after the seed
        output = f'1fold_s{split_num}'

        if dump_ids:
            id_files = (
                (tr_id, f'{output}_tr_id.txt'),
                (vl_id, f'{output}_vl_id.txt'),
                (te_id, f'{output}_te_id.txt'),
            )
            for ids, fname in id_files:
                # One id per line
                np.savetxt(gout/fname, ids.reshape(-1, 1),
                           fmt='%d', delimiter='', newline='\n')

        if dump_figs:
            figures = (
                (tr_id, 'Train Set Histogram', f'{output}_y_hist_train.png'),
                (vl_id, 'Val Set Histogram', f'{output}_y_hist_val.png'),
                (te_id, 'Test Set Histogram', f'{output}_y_hist_test.png'),
            )
            for ids, title, fname in figures:
                plot_hist(ydata[ids], title=title,
                          fit=None, bins=100, path=outfigs/fname)

    return (tr_dct, vl_dct, te_dct)
コード例 #2
0
ファイル: cv_splits.py プロジェクト: adpartin/pdx-histo
def run(args):
    """
    Generate train/val/test splits for the annotations of one app.

    Reads `annotations.csv` from `MAIN_APPDIR / args.appname`, plots the
    histogram of the target column (when it is present in the data) and calls
    data_splitter() to produce `args.n_splits` splits. All outputs go to
    `<appdir>/splits` (figures to `<appdir>/splits/outfigs`).

    Args:
        args : parsed arguments; the attributes read here are `te_size`,
            `appname`, `split_on`, `cv_method`, `ml_task`, `trg_name` and
            `n_splits`
    Raises:
        ValueError : if `args.cv_method` is 'strat' but the column
            `args.trg_name` is not in the loaded data
    """
    te_size = verify_size(args.te_size)

    # Path
    appdir = MAIN_APPDIR / args.appname

    # Hard split
    split_on = None if args.split_on is None else args.split_on.upper()
    te_method = args.cv_method

    # Specify ML task (regression or classification)
    if args.cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # -----------------------------------------------
    #       Create appdir
    # -----------------------------------------------
    gout = appdir / 'splits'
    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    # Everything below runs under try/finally so that the logger is released
    # even when loading, validation or splitting raises.
    try:
        print_fn = get_print_func(lg.logger)
        # NOTE(review): `fdir` is not defined in this function; presumably a
        # module-level path -- confirm.
        print_fn(f'File dir: {fdir}')
        print_fn(f'\n{pformat(vars(args))}')
        dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

        # -----------------------------------------------
        #       Load data
        # -----------------------------------------------
        print_fn('\nLoad master dataset.')
        data = load_data(appdir / 'annotations.csv')
        print_fn(f'data.shape {data.shape}')

        # -----------------------------------------------
        #       Determine the dataset
        # -----------------------------------------------
        ydata = data[args.trg_name] if args.trg_name in data.columns else None
        if (ydata is None) and (args.cv_method == 'strat'):
            raise ValueError(
                'Y data must be specified if splits needs to be stratified.')
        if ydata is not None:
            plot_hist(ydata,
                      title=f'{args.trg_name}',
                      fit=None,
                      bins=100,
                      path=outfigs / f'{args.trg_name}_hist_all.png')

        # -----------------------------------------------
        #       Generate splits (train/val/test)
        # -----------------------------------------------
        rule = '-' * 50
        print_fn(f'\n{rule}')
        print_fn('Split into hold-out train/val/test')
        print_fn(rule)

        # Forwarded by data_splitter() to the single-split generator
        kwargs = {
            'data': data,
            'cv_method': args.cv_method,
            'te_method': te_method,
            'te_size': te_size,
            'mltype': mltype,
            'split_on': split_on
        }

        data_splitter(n_splits=args.n_splits,
                      gout=gout,
                      outfigs=outfigs,
                      ydata=ydata,
                      print_fn=print_fn,
                      **kwargs)
    finally:
        lg.kill_logger()
コード例 #3
0
def data_splitter(
        data,
        n_splits: int = 1,
        gout: Optional[Any] = None,
        outfigs: Optional[Any] = None,
        target_name: Optional[str] = None,
        print_fn=print,
        seed: Optional[int] = None,
        **kwargs) -> Tuple[dict, dict, dict]:
    """
    This func calls gen_single_split() a total of n_splits times to generate
    multiple train/val/test splits.

    Args:
        data : dataset to split; forwarded to gen_single_split(). Its
            `columns` attribute is consulted when histograms are requested.
        n_splits : number of splits
        gout : global outdir to dump the splits (path-like, joined with `/`);
            nothing is written when None
        outfigs : outdir to dump the distributions of target variable; figures
            are produced only when `target_name` is a column of `data`
        target_name : name of the target column in `data`
        print_fn : print function (accepted but not used in this function)
        seed : when given, makes the per-split seeds reproducible; when None
            the seeds are drawn from numpy's global random state
        **kwargs : forwarded unchanged to gen_single_split()
    Return:
        tr_dct, vl_dct, te_dct : tuple of split dicts, each keyed by the split
            number (0 .. n_splits-1)
    """
    # A private RandomState keeps a user-provided seed from touching numpy's
    # global state; without a seed we keep drawing from the global state so
    # callers that seeded numpy themselves still get the same sequence.
    rng = np.random if seed is None else np.random.RandomState(seed)
    split_seeds = rng.choice(n_splits, n_splits, replace=False)

    # These dicts will contain the splits
    tr_dct = {}
    vl_dct = {}
    te_dct = {}

    # The target column does not change between splits, so look it up once
    ydata = None
    if (outfigs is not None) and (target_name in data.columns):
        ydata = data[target_name]

    for i, split_seed in enumerate(split_seeds):
        tr_id, vl_id, te_id = gen_single_split(data,
                                               target_name=target_name,
                                               seed=split_seed,
                                               **kwargs)

        tr_dct[i] = tr_id
        vl_dct[i] = vl_id
        te_dct[i] = te_id

        # Files are named after the split number, not after the seed
        output = f'1fold_s{i}'

        if gout is not None:
            for ids, fname in ((tr_id, f'{output}_tr_id.txt'),
                               (vl_id, f'{output}_vl_id.txt'),
                               (te_id, f'{output}_te_id.txt')):
                np.savetxt(gout / fname,
                           ids,
                           fmt='%d',
                           delimiter='',
                           newline='\n')

        if ydata is not None:
            # NOTE(review): `ydata[ids]` assumes the ids returned by
            # gen_single_split() are valid keys for the column -- confirm.
            for ids, title, fname in (
                    (tr_id, 'Train Set Histogram', f'{output}_y_hist_train.png'),
                    (vl_id, 'Val Set Histogram', f'{output}_y_hist_val.png'),
                    (te_id, 'Test Set Histogram', f'{output}_y_hist_test.png')):
                plot_hist(ydata[ids],
                          title=title,
                          fit=None,
                          bins=100,
                          path=outfigs / fname)

    return (tr_dct, vl_dct, te_dct)
コード例 #4
0
def run(args):
    """
    Generate train/val/test splits for the dataset at `args.datapath`.

    The output dir is `<datapath without suffix>.splits`, created next to the
    dataset, or under `args.gout` when that is given. The histogram of the
    target column is plotted when the column is present in the data, and
    data_splitter() is called to produce the splits.

    Args:
        args : parsed arguments; the attributes read here are `n_splits`,
            `te_size`, `datapath`, `split_on`, `cv_method`, `ml_task`,
            `trg_name` and `gout`
    Raises:
        ValueError : if `args.cv_method` is 'strat' but the target column is
            not in the loaded data
    """
    t0 = time()
    n_splits = int(args.n_splits)
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard split
    split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    #       Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        # Only the final component is joined: datapath is absolute, and
        # joining an absolute path would discard args.gout.
        gout = Path(args.gout).resolve() / datapath.with_suffix('.splits').name
    else:
        gout = datapath.with_suffix('.splits')

    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    # Everything below runs under try/finally so that the logger is released
    # even when loading, validation or splitting raises.
    try:
        print_fn = get_print_func(lg.logger)
        # NOTE(review): `filepath` is not defined in this function; presumably
        # a module-level path -- confirm.
        print_fn(f'File path: {filepath}')
        print_fn(f'\n{pformat(vars(args))}')
        dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

        # -----------------------------------------------
        #       Load data
        # -----------------------------------------------
        print_fn('\nLoad master dataset.')
        data = load_data(datapath)
        print_fn(f'data.shape {data.shape}')

        ydata = data[trg_name] if trg_name in data.columns else None
        if (ydata is None) and (cv_method == 'strat'):
            raise ValueError(
                'Y data must be available if splits are required to stratified.')
        if ydata is not None:
            plot_hist(ydata,
                      title=f'{trg_name}',
                      fit=None,
                      bins=100,
                      path=outfigs / f'{trg_name}_hist_all.png')

        # -----------------------------------------------
        #       Generate splits (train/val/test)
        # -----------------------------------------------
        rule = '-' * 50
        print_fn(f'\n{rule}')
        print_fn('Split into hold-out train/val/test')
        print_fn(rule)

        # Forwarded by data_splitter() to the single-split generator
        kwargs = {
            'data': data,
            'cv_method': cv_method,
            'te_method': te_method,
            'te_size': te_size,
            'mltype': mltype,
            'split_on': split_on
        }

        data_splitter(n_splits=n_splits,
                      gout=gout,
                      outfigs=outfigs,
                      ydata=ydata,
                      print_fn=print_fn,
                      **kwargs)

        print_fn('Runtime: {:.1f} min'.format((time() - t0) / 60))
        print_fn('Done.')
    finally:
        lg.kill_logger()
コード例 #5
0
ファイル: main_data_split.py プロジェクト: adpartin/pdx-histo
def run(args):
    """
    Generate train/val/test splits for the dataset at `args.datapath`.

    Without `args.gout` the output dir is `<datapath without suffix>.splits`
    next to the dataset. With `args.gout` it is
    `<gout>/<dataset name>.splits/split_on_<split_on or "none">`. The histogram
    of the target column is plotted when the column is present in the data,
    and data_splitter() is called to produce the splits.

    Args:
        args : parsed arguments; the attributes read here are `te_size`,
            `datapath`, `cv_method`, `ml_task`, `trg_name`, `gout`, `split_on`
            and `n_splits`
    Raises:
        ValueError : if `args.cv_method` is "strat" but the target column is
            not in the loaded data
    """
    print("\nInput args:")
    pprint(vars(args))

    t0 = time()
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard split
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == "strat":
        mltask = "cls"  # cast mltask to cls in case of stratification
    else:
        mltask = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    #       Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        sufx = "none" if args.split_on is None else args.split_on
        # Join only the final path component: datapath is absolute, and
        # `Path / <absolute path>` would silently discard args.gout.
        gout = Path(args.gout).resolve() / datapath.with_suffix(".splits").name
        gout = gout / f"split_on_{sufx}"
    else:
        gout = datapath.with_suffix(".splits")

    outfigs = gout / "outfigs"
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / "data.splitter.log")
    # Everything below runs under try/finally so that the logger is released
    # even when loading, validation or splitting raises.
    try:
        print_fn = get_print_func(lg.logger)
        # NOTE(review): `fdir` is not defined in this function; presumably a
        # module-level path -- confirm.
        print_fn(f"File path: {fdir}")
        print_fn(f"\n{pformat(vars(args))}")
        dump_dict(vars(args), outpath=gout / "data.splitter.args.txt")

        # -----------------------------------------------
        #       Load data
        # -----------------------------------------------
        print_fn("\nLoad master dataset.")
        data = load_data(datapath)
        print_fn(f"data.shape {data.shape}")

        has_target = trg_name in data.columns

        if (cv_method == "strat") and (not has_target):
            raise ValueError(
                "Prediction target column must be available if splits need to be stratified."
            )

        if has_target:
            plot_hist(data[trg_name],
                      title=f"{trg_name}",
                      fit=None,
                      bins=100,
                      path=outfigs / f"{trg_name}_hist_all.png")

        # -----------------------------------------------
        #       Generate splits (train/val/test)
        # -----------------------------------------------
        rule = "-" * 50
        print_fn(f"\n{rule}")
        print_fn("Split data into hold-out train/val/test")
        print_fn(rule)

        # Forwarded by data_splitter() to the single-split generator
        kwargs = {
            "cv_method": cv_method,
            "te_method": te_method,
            "te_size": te_size,
            "mltask": mltask,
            "split_on": args.split_on
        }

        # NOTE(review): `seed` is not defined in this function; presumably a
        # module-level constant -- confirm, otherwise this raises NameError.
        data_splitter(
            data=data,
            n_splits=args.n_splits,
            gout=gout,
            outfigs=outfigs,
            target_name=trg_name,
            print_fn=print_fn,
            seed=seed,
            **kwargs)

        print_fn("Runtime: {:.1f} min".format((time() - t0) / 60))
        print_fn("Done.")
    finally:
        lg.close_logger()