def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument('--cancer_types',
                    nargs='*',
                    help='cancer types to predict, if not included predict '
                    'all cancer types in TCGA')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped cancer types to')
    io.add_argument('--output_preds', action='store_true')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['cancer_type'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')
    opts.add_argument('--training_data',
                      type=str,
                      default='expression',
                      choices=list(cfg.data_types.keys()),
                      help='what data type to train model on')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    # check that all provided cancer types are valid TCGA acronyms
    sample_info_df = du.load_sample_info(args.training_data, args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))

    if args.cancer_types is None:
        args.cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.cancer_types) - set(tcga_cancer_types)
        if len(not_in_tcga) > 0:
            parser.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.n_dim = None
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types

    # add information about valid samples to model options
    model_options.sample_overlap_data_types = list(
        get_overlap_data_types(use_subsampled=model_options.debug).keys())

    return io_args, model_options, sample_info_df
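
Since the comments above stress that `model_options` (unlike `io_args`) should be saved alongside the experiment results, a caller typically persists it as metadata right after parsing. Below is a minimal sketch of one way to do that, assuming `model_options` is an `argparse.Namespace`; the helper name and output filename are hypothetical, not part of the project.

import json
from pathlib import Path

def save_model_options(results_dir, model_options):
    # hypothetical helper: serialize the model_options namespace so the
    # exact hyperparameters used for an experiment travel with its results;
    # str() keeps non-JSON-serializable config values (e.g. arrays of
    # alphas) from breaking the dump
    metadata = {k: str(v) for k, v in vars(model_options).items()}
    out_file = Path(results_dir, 'experiment_metadata.json')
    with open(out_file, 'w') as f:
        json.dump(metadata, f, indent=2, sort_keys=True)
    return out_file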
Example #2
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument('--gene', default='TP53')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped genes to')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['mutation'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--nonlinear',
                      action='store_true',
                      help='use gradient-boosted classifier instead of the '
                      'default elastic net classifier')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']
    io_args.gene_set = [args.gene]

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.n_dim = None
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type
    model_options.training_data = 'expression'
    model_options.overlap_data_types = ['expression']
    model_options.bc_titration = True

    return io_args, model_options
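
Every one of these examples calls `du.split_argument_groups(args, parser)` to separate the flat parsed `Namespace` back into its `io` and `model_options` groups. The project's actual implementation lives in its data utilities and may differ; a common pattern for this is sketched below (note it relies on private argparse attributes, so treat it as an illustration rather than a stable API).

import argparse

def split_argument_groups(args, parser):
    """Split a parsed Namespace into one Namespace per argument group.

    Sketch only: _action_groups and _group_actions are argparse internals,
    so this mirrors the idea rather than the project's real helper.
    """
    arg_groups = {}
    for group in parser._action_groups:
        group_dict = {
            action.dest: getattr(args, action.dest, None)
            for action in group._group_actions
        }
        arg_groups[group.title] = argparse.Namespace(**group_dict)
    return arg_groups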
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument('--custom_genes',
                    nargs='*',
                    default=None,
                    help='currently this needs to be a subset of top_50')
    io.add_argument(
        '--gene_set',
        type=str,
        choices=['top_50', 'vogelstein', '50_random', 'custom'],
        default='top_50',
        help='choose which gene set to use. top_50 and vogelstein are '
        'predefined gene sets (see data_utilities), and custom allows '
        'any gene or set of genes in TCGA, specified in --custom_genes')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped genes to')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['mutation'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--batch_correction',
                      action='store_true',
                      help='if included, use limma to remove linear signal, '
                      'this is useful to determine how much non-linear signal '
                      'exists in the data')
    opts.add_argument(
        '--bc_cancer_type',
        action='store_true',
        help='if included, use limma to remove linear cancer type signal')
    opts.add_argument(
        '--bc_train_test',
        action='store_true',
        help='if included, fit BE correction model on train set, '
        'then apply to test set')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--drop_target',
                      action='store_true',
                      help='drop target gene from feature set, '
                      'currently only implemented for expression data')
    opts.add_argument(
        '--feature_selection',
        choices=['f_test', 'mad', 'random'],
        help='method to use for feature selection, only applied if '
        '0 < num_features < total number of columns')
    opts.add_argument(
        '--num_features',
        type=int,
        default=cfg.num_features_raw,
        help='if included, select this number of features, using '
        'feature selection method in feature_selection')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--nonlinear',
                      action='store_true',
                      help='use gradient-boosted classifier instead of the '
                      'default elastic net classifier')
    opts.add_argument('--only_target',
                      action='store_true',
                      help='use only target gene + non-gene covariates, '
                      'currently only implemented for expression data')
    opts.add_argument('--overlap_data_types',
                      nargs='*',
                      default=['expression'],
                      help='data types to define set of samples to use; e.g. '
                      'set of data types for a model comparison, use only '
                      'overlapping samples from these data types')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument('--training_data',
                      type=str,
                      default='expression',
                      choices=list(cfg.data_types.keys()),
                      help='what data type to train model on')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    if args.gene_set == 'custom':
        if args.custom_genes is None:
            parser.error(
                'must include --custom_genes when --gene_set=\'custom\'')
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif (args.gene_set != 'custom' and args.custom_genes is not None):
        parser.error(
            'must use option --gene_set=\'custom\' if custom genes are included'
        )

    if args.drop_target and args.only_target:
        parser.error('drop_target and only_target are mutually exclusive')

    if (args.drop_target
            or args.only_target) and (args.training_data != 'expression'):
        parser.error(
            'drop_target and only_target only implemented for expression data')

    # check that all data types in overlap_data_types are valid
    check_all_data_types(parser, args.overlap_data_types, args.debug)

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.n_dim = None
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type

    return io_args, model_options
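
This example (and several below) also validates `--overlap_data_types` via `check_all_data_types`. Its real behavior is defined elsewhere in the project; a plausible minimal version, assuming valid types are simply the keys of `cfg.data_types` and ignoring the debug flag, could look like this.

def check_all_data_types(parser, overlap_data_types, debug=False):
    # sketch: reject any requested overlap data type not defined in the
    # project config; the real helper may treat debug (subsampled data)
    # differently
    valid_data_types = set(cfg.data_types.keys())
    invalid = set(overlap_data_types) - valid_data_types
    if invalid:
        parser.error('invalid overlap data types: {}'.format(
            ' '.join(sorted(invalid))))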
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument('--custom_genes',
                    nargs='*',
                    default=None,
                    help='currently this needs to be a subset of top_50')
    io.add_argument(
        '--gene_set',
        type=str,
        choices=['top_50', 'vogelstein', 'custom'],
        default='top_50',
        help='choose which gene set to use. top_50 and vogelstein are '
        'predefined gene sets (see data_utilities), and custom allows '
        'any gene or set of genes in TCGA, specified in --custom_genes')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped genes to')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['multimodal'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--n_dim',
                      nargs='*',
                      default=None,
                      help='list of compressed dimensions to use, defaults to '
                      'uncompressed data for all data types')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--overlap_data_types',
                      nargs='*',
                      default=['expression'],
                      help='data types to define set of samples to use; e.g. '
                      'set of data types for a model comparison, use only '
                      'overlapping samples from these data types')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')
    opts.add_argument('--training_data',
                      nargs='*',
                      default=['expression'],
                      help='which data types to train model on')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    if args.gene_set == 'custom':
        if args.custom_genes is None:
            parser.error(
                'must include --custom_genes when --gene_set=\'custom\'')
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif (args.gene_set != 'custom' and args.custom_genes is not None):
        parser.error(
            'must use option --gene_set=\'custom\' if custom genes are included'
        )

    # check that all training data types are defined in config
    if (len(set(args.training_data).intersection(set(cfg.data_types.keys())))
            != len(set(args.training_data))):
        parser.error('training_data data types must be in config.data_types')

    # check that all data types in overlap_data_types are valid
    #
    # here I'm just checking this argument against the non-compressed data types,
    # downstream code will check if data types we request compressed data for
    # really have compressed data, but don't need to catch that here
    check_all_data_types(parser, args.overlap_data_types, args.debug)

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # if no n_dim argument provided, set all to None
    if model_options.n_dim is None:
        model_options.n_dim = [None] * len(model_options.training_data)
    else:
        # convert None strings from argparse to python Nones
        model_options.n_dim = [
            None if n == 'None' else n for n in model_options.n_dim
        ]

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios

    # for these experiments, we want to standardize all data types to make them
    # comparable when used as predictive features
    model_options.standardize_data_types = list(model_options.training_data)
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type

    return io_args, model_options
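
In this multi-omics variant, `--training_data` and `--n_dim` are parallel lists: one compression dimension per data type, with `None` meaning uncompressed. A short usage sketch of how downstream code might pair them follows; note that non-`None` values arrive from argparse as strings and still need an int cast.

# assuming model_options comes from the process_args() above
for data_type, n_dim in zip(model_options.training_data, model_options.n_dim):
    if n_dim is None:
        print('using raw (uncompressed) features for {}'.format(data_type))
    else:
        # n_dim was parsed without type=int, so cast before use
        print('using {} compressed dimensions for {}'.format(
            int(n_dim), data_type))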
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group('io',
                                   'arguments related to script input/output, '
                                   'note these will *not* be saved in metadata ')
    io.add_argument('--log_file', default=None,
                    help='name of file to log errors to')
    io.add_argument('--output_preds', action='store_true')
    io.add_argument('--results_dir', default=cfg.results_dirs['controls'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group('model_options',
                                     'parameters for training/evaluating model, '
                                     'these will affect output and are saved as '
                                     'experiment metadata ')
    opts.add_argument('--classify', action='store_true',
                      help='if included, binarize tumor purity values into '
                           'above and below median, otherwise predict '
                           'continuous purity values using regression')
    opts.add_argument('--debug', action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--num_folds', type=int, default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument('--subset_mad_genes', type=int, default=cfg.num_features_raw,
                      help='if included, subset gene features to this number of '
                           'features having highest mean absolute deviation')
    opts.add_argument('--training_data', type=str, default='expression',
                      choices=list(cfg.data_types.keys()),
                      help='what data type to train model on')
    opts.add_argument('--use_compressed', action='store_true',
                      help='use PCA compressed data rather than raw features')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    if args.use_compressed and args.training_data not in cfg.compressed_data_types:
        parser.error(
            'data type {} does not have a compressed data source'.format(
                args.training_data)
        )

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # always use 5000 PCs if `use_compressed==True`
    model_options.n_dim = None
    if model_options.use_compressed:
        model_options.n_dim = 5000

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    if model_options.classify:
        model_options.max_iter = cfg.max_iter
        model_options.alphas = cfg.alphas
        model_options.l1_ratios = cfg.l1_ratios
    else:
        model_options.max_iter = cfg.reg_max_iter
        model_options.alphas = cfg.reg_alphas
        model_options.l1_ratios = cfg.reg_l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types

    # add information about valid samples to model options
    model_options.sample_overlap_data_types = list(
        get_overlap_data_types(
            use_subsampled=model_options.debug,
            compressed_data=model_options.use_compressed
        ).keys()
    )

    return io_args, model_options
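
The classify/regress branch above only picks hyperparameter ranges; it does not build a model. One way those ranges could feed an elastic-net search with scikit-learn is sketched below; the SGD estimators and grid search are illustrative assumptions, not necessarily the estimators the project uses.

from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.model_selection import GridSearchCV

def build_search(model_options):
    # sketch: elastic net penalty in both branches, swapping the loss
    # depending on whether we binarize purity (classify) or regress on it
    if model_options.classify:
        # 'log_loss' is the logistic loss name in recent scikit-learn
        # versions (older versions call it 'log')
        estimator = SGDClassifier(loss='log_loss', penalty='elasticnet',
                                  max_iter=model_options.max_iter)
    else:
        estimator = SGDRegressor(penalty='elasticnet',
                                 max_iter=model_options.max_iter)
    param_grid = {
        'alpha': list(model_options.alphas),
        'l1_ratio': list(model_options.l1_ratios),
    }
    return GridSearchCV(estimator, param_grid, cv=model_options.num_folds)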
Example #6
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group('io',
                                   'arguments related to script input/output, '
                                   'note these will *not* be saved in metadata ')
    io.add_argument('--cancer_types', nargs='*', default=['all_cancer_types'],
                    help='cancer types to run, \'pancancer\' for a pan-cancer model '
                         'combining cancer types, default is all individual TCGA '
                         'cancer types + pan-cancer model')
    io.add_argument('--log_file', default=None,
                    help='name of file to log errors to')
    io.add_argument('--results_dir', default=cfg.results_dirs['msi'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group('model_options',
                                     'parameters for training/evaluating model, '
                                     'these will affect output and are saved as '
                                     'experiment metadata ')
    opts.add_argument('--debug', action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--num_folds', type=int, default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--overlap_data_types', nargs='*',
                      default=['expression'],
                      help='data types to define set of samples to use; e.g. '
                           'set of data types for a model comparison, use only '
                           'overlapping samples from these data types')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument('--subset_mad_genes', type=int, default=cfg.num_features_raw,
                      help='if included, subset gene features to this number of '
                           'features having highest mean absolute deviation')
    opts.add_argument('--training_data', type=str, default='expression',
                      choices=list(cfg.data_types.keys()),
                      help='what data type to train model on')
    # TODO use survival method for compression?
    opts.add_argument('--use_compressed', action='store_true',
                      help='use PCA compressed data rather than raw features')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    if args.use_compressed and args.training_data not in cfg.compressed_data_types:
        parser.error(
            'data type {} does not have a compressed data source'.format(
                args.training_data)
        )

    msi_cancer_types = cfg.msi_cancer_types + ['pancancer']
    if 'all_cancer_types' in args.cancer_types:
        args.cancer_types = msi_cancer_types
    else:
        not_in_msi = set(args.cancer_types) - set(msi_cancer_types)
        if len(not_in_msi) > 0:
            parser.error('some cancer types do not have MSI labels: {}'.format(
                ' '.join(not_in_msi)))

    # check that all data types in overlap_data_types are valid
    check_all_data_types(parser, args.overlap_data_types, args.debug)

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # always use 5000 PCs if `use_compressed==True`
    # TODO: just use n_dim option?
    model_options.n_dim = None
    if model_options.use_compressed:
        model_options.n_dim = 5000

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.max_iter = cfg.max_iter
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types

    return io_args, model_options
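
Each example also accepts a `--log_file`, defaulting to `log_skipped.tsv` inside the results directory, for recording cancer types or genes that get skipped. A hedged sketch of how a caller might append to it is shown below; the two-column layout is a guess.

import csv

def log_skipped(log_file, identifier, reason):
    # sketch: append one tab-separated row per skipped item so failures
    # can be reviewed after a long run
    with open(log_file, 'a', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow([identifier, reason])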
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument(
        '--cancer_types',
        nargs='*',
        default=['all_cancer_types'],
        help='cancer types to run, \'pancancer\' for a pan-cancer model '
        'combining cancer types, default is all individual TCGA '
        'cancer types + pan-cancer model')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped cancer types to')
    io.add_argument('--output_survival_fn', action='store_true')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['survival'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument(
        '--fit_ridge',
        action='store_true',
        help='if included, fit ridge-regularized survival model instead '
        'of elastic net model. this tends to converge slightly faster '
        'and more robustly on smaller feature sets, but may fit slowly '
        'or not at all on large sets of features')
    opts.add_argument(
        '--n_dim',
        default=None,
        help='number of compressed components/dimensions to use, '
        'None to use raw features')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--overlap_data_types',
                      nargs='*',
                      default=['expression'],
                      help='data types to define set of samples to use; e.g. '
                      'set of data types for a model comparison, use only '
                      'overlapping samples from these data types')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')
    opts.add_argument('--training_data',
                      type=str,
                      default='expression',
                      choices=list(cfg.data_types.keys()) + ([
                          'baseline', 'vogelstein_mutations',
                          'significant_mutations', 'mutation_preds_expression',
                          'mutation_preds_me_27k', 'mutation_preds_me_450k'
                      ]),
                      help='what data type to train model on')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    if args.n_dim is not None:
        args.n_dim = int(args.n_dim)

    if args.training_data == 'baseline':
        sample_info_df = (du.load_sample_info('expression',
                                              verbose=args.verbose))
    else:
        sample_info_df = (du.load_sample_info(args.training_data,
                                              verbose=args.verbose))

    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    tcga_cancer_types.append('pancancer')
    if 'all_cancer_types' in args.cancer_types:
        args.cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.cancer_types) - set(tcga_cancer_types)
        if len(not_in_tcga) > 0:
            parser.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))

    # check that all data types in overlap_data_types are valid
    check_all_data_types(parser, args.overlap_data_types, args.debug)

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.max_iter = cfg.max_iter_map['survival']
    model_options.alphas = cfg.alphas_map['survival']
    model_options.l1_ratios = cfg.l1_ratios_map['survival']
    model_options.standardize_data_types = cfg.standardize_data_types
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type

    return io_args, model_options, sample_info_df
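
Finally, every variant exposes `--num_folds` and `--seed`. A minimal sketch of how these two options might drive reproducible cross-validation splits downstream (plain KFold here; the project may stratify differently):

import numpy as np
from sklearn.model_selection import KFold

def make_cv_splits(X, model_options):
    # sketch: reproducible K-fold splits driven by the parsed options
    kf = KFold(n_splits=model_options.num_folds,
               shuffle=True,
               random_state=model_options.seed)
    return list(kf.split(X))

# e.g. make_cv_splits(np.zeros((100, 10)), model_options)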