def _load_data(self, sample_info=None, debug=False, test=False):
        """Load the datasets shared across all genes/cancer types.

        None of this data depends on the gene or cancer type being
        considered, so it only needs to be loaded once (when the class is
        instantiated).

        Arguments:
        ----------
        sample_info: sample info to store as self.sample_info_df; if None,
                     it is loaded with du.load_sample_info
        debug (bool): whether or not to subset data for faster debugging
        test (bool): whether or not to subset columns in mutation data,
                     for testing

        Sets self.rnaseq_df, self.sample_info_df, self.sample_freeze_df,
        self.mutation_df, self.copy_loss_df, self.copy_gain_df and
        self.mut_burden_df.
        """
        # expression data
        self.rnaseq_df = du.load_expression_data(verbose=self.verbose,
                                                 debug=debug)

        # sample info: reuse what the caller passed in, if anything
        self.sample_info_df = (du.load_sample_info(verbose=self.verbose)
                               if sample_info is None else sample_info)

        # pancancer data (described in more detail in the
        # load_pancancer_data docstring)
        pancan_kwargs = {'verbose': self.verbose}
        if test:
            # restricting to the test genes is much faster than loading
            # mutation data for all genes
            pancan_kwargs.update(test=True, subset_columns=cfg.test_genes)
        pancan_data = du.load_pancancer_data(**pancan_kwargs)

        # unpack the five components in the order they are returned
        (self.sample_freeze_df,
         self.mutation_df,
         self.copy_loss_df,
         self.copy_gain_df,
         self.mut_burden_df) = pancan_data
def process_args():
    """Parse and validate command-line arguments.

    Besides plain parsing, this function:
      * replaces args.gene_set with the list given in --custom_genes when
        --gene_set=custom (args.custom_genes is removed in that case)
      * loads sample info with du.load_sample_info, and uses its
        cancer_type column to validate --holdout_cancer_types (or to
        fill it in with every cancer type when the option is omitted)
      * resolves args.results_dir to an absolute Path, and defaults
        args.log_file to 'log_skipped.tsv' inside that directory

    Invalid option combinations are reported through ArgumentParser.error,
    which prints a usage message and exits.

    Returns:
    --------
    args (argparse.Namespace): parsed and post-processed arguments
    sample_info_df: sample info, as returned by du.load_sample_info
    """
    p = argparse.ArgumentParser()
    p.add_argument('--custom_genes',
                   nargs='*',
                   default=None,
                   help='currently this needs to be a subset of top_50')
    p.add_argument('--debug',
                   action='store_true',
                   help='use subset of data for fast debugging')
    p.add_argument(
        '--gene_set',
        type=str,
        choices=['top_50', 'vogelstein', 'custom'],
        default='top_50',
        help='choose which gene set to use. top_50 and vogelstein are '
        'predefined gene sets (see data_utilities), and custom allows '
        'any gene or set of genes in TCGA, specified in --custom_genes')
    p.add_argument('--holdout_cancer_types',
                   nargs='*',
                   default=None,
                   help='provide a list of cancer types to hold out, uses all '
                   'cancer types in TCGA if none are provided')
    p.add_argument('--how_to_add',
                   type=str,
                   choices=['random', 'similarity'],
                   default='random',
                   help='Method for choosing cancer types to add to the '
                   'training dataset; see data model for details')
    p.add_argument('--log_file',
                   default=None,
                   help='name of file to log skipped cancer types to')
    p.add_argument('--num_folds',
                   type=int,
                   default=4,
                   help='number of folds of cross-validation to run')
    p.add_argument('--results_dir',
                   default=cfg.results_dir,
                   help='where to write results to')
    p.add_argument('--seed', type=int, default=cfg.default_seed)
    p.add_argument('--subset_mad_genes',
                   type=int,
                   default=cfg.num_features_raw,
                   help='if included, subset gene features to this number of '
                   'features having highest mean absolute deviation')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    if args.gene_set == 'custom':
        # nargs='*' produces an empty list when the flag is passed without
        # values, so test for emptiness rather than only for None
        if not args.custom_genes:
            p.error('must include --custom_genes when --gene_set=\'custom\'')
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif args.custom_genes is not None:
        p.error(
            'must use option --gene_set=\'custom\' if custom genes are included'
        )

    sample_info_df = du.load_sample_info(args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    if args.holdout_cancer_types is None:
        args.holdout_cancer_types = tcga_cancer_types
    else:
        # sorted so that the error message is the same on every run
        not_in_tcga = sorted(
            set(args.holdout_cancer_types) - set(tcga_cancer_types))
        if not_in_tcga:
            p.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    return args, sample_info_df
# Example #3
# 0
def data_model():
    """Build a TCGADataModel in debug/test mode and load sample info.

    Returns:
    --------
    (tcga_data, sample_info_df) tuple, where tcga_data is the
    TCGADataModel instance and sample_info_df is the result of
    du.load_sample_info()
    """
    # TODO: define results dir?
    model = TCGADataModel(debug=True, test=True)
    return model, du.load_sample_info()
        log_df.to_csv(args.log_file, sep='\t')

    tcga_data = TCGADataModel(seed=args.seed,
                              subset_mad_genes=args.subset_mad_genes,
                              verbose=args.verbose,
                              debug=args.debug)

    # same sampled genes from cross-cancer individual identifier experiments
    genes_df = du.load_vogelstein()
    sampled_genes = [
        'APC', 'BRCA1', 'EGFR', 'FGFR2', 'H3F3A', 'HRAS', 'MSH2', 'PIK3CA',
        'PPP2R1A', 'VHL'
    ]

    # and use all cancer types in TCGA
    sample_info_df = du.load_sample_info(args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))

    # identifiers have the format {gene}_{cancer_type}
    test_identifiers = [
        '_'.join(t) for t in it.product(sampled_genes, tcga_cancer_types)
    ]

    # create output directory
    output_dir = Path(args.results_dir, 'pan_cross_cancer').resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))
def expression_data():
    """Read the test expression data and the sample info.

    Expression data is parsed from the tab-separated file at
    cfg.test_expression, using its first column as the index; sample info
    comes from du.load_sample_info().

    Returns:
    --------
    (rnaseq_df, sample_info_df) tuple
    """
    expression = pd.read_csv(cfg.test_expression, sep='\t', index_col=0)
    return expression, du.load_sample_info()
def process_args():
    """Parse and validate command-line arguments.

    Besides plain parsing, this function:
      * replaces args.gene_set with the list given in --custom_genes when
        --gene_set=custom (args.custom_genes is removed in that case)
      * loads sample info with du.load_sample_info, and uses its
        cancer_type column to validate --holdout_cancer_types (or to
        fill it in with every cancer type when the option is omitted)
      * sets args.tca_params to a dict built from the --tca_* options
        (with sigma fixed at 1.0) when --tca is given, and to None
        otherwise
      * resolves args.results_dir to an absolute Path, and defaults
        args.log_file to 'log_skipped.tsv' inside that directory

    Invalid option combinations are reported through ArgumentParser.error,
    which prints a usage message and exits.

    Returns:
    --------
    args (argparse.Namespace): parsed and post-processed arguments
    sample_info_df: sample info, as returned by du.load_sample_info
    """
    p = argparse.ArgumentParser()
    # NOTE: adjacent string literals are concatenated as-is, so each
    # continued help string needs its own separating space
    p.add_argument('--coral',
                   action='store_true',
                   help='if true, use CORAL method to align source and '
                   'target distributions')
    p.add_argument('--coral_by_cancer_type',
                   action='store_true',
                   help='if true, use CORAL method to align source and '
                   'target distributions, per cancer type')
    p.add_argument('--coral_lambda', type=float, default=1.0)
    p.add_argument('--custom_genes',
                   nargs='*',
                   default=None,
                   help='currently this needs to be a subset of top_50')
    p.add_argument('--debug',
                   action='store_true',
                   help='use subset of data for fast debugging')
    p.add_argument(
        '--gene_set',
        type=str,
        choices=['top_50', 'vogelstein', 'custom'],
        default='top_50',
        help='choose which gene set to use. top_50 and vogelstein are '
        'predefined gene sets (see data_utilities), and custom allows '
        'any gene or set of genes in TCGA, specified in --custom_genes')
    p.add_argument('--holdout_cancer_types',
                   nargs='*',
                   default=None,
                   help='provide a list of cancer types to hold out, uses all '
                   'cancer types in TCGA if none are provided')
    p.add_argument('--log_file',
                   default=None,
                   help='name of file to log skipped cancer types to')
    p.add_argument('--num_folds',
                   type=int,
                   default=4,
                   help='number of folds of cross-validation to run')
    p.add_argument(
        '--pancancer_only',
        action='store_true',
        help='if included, omit test cancer type data from training '
        'set for pancancer experiments')
    p.add_argument('--results_dir',
                   default=cfg.results_dir,
                   help='where to write results to')
    p.add_argument('--seed', type=int, default=cfg.default_seed)
    p.add_argument('--subset_mad_genes',
                   type=int,
                   default=cfg.num_features_raw,
                   help='if included, subset gene features to this number of '
                   'features having highest mean absolute deviation')
    p.add_argument('--tca',
                   action='store_true',
                   help='if true, use TCA method to map source and target '
                   'data into same feature space')
    p.add_argument('--tca_kernel_type',
                   choices=['linear', 'rbf'],
                   default='linear')
    p.add_argument('--tca_mu', type=float, default=0.1)
    p.add_argument('--tca_n_components', type=int, default=100)
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    if args.gene_set == 'custom':
        # nargs='*' produces an empty list when the flag is passed without
        # values, so test for emptiness rather than only for None
        if not args.custom_genes:
            p.error('must include --custom_genes when --gene_set=\'custom\'')
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif args.custom_genes is not None:
        p.error(
            'must use option --gene_set=\'custom\' if custom genes are included'
        )

    sample_info_df = du.load_sample_info(args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    if args.holdout_cancer_types is None:
        args.holdout_cancer_types = tcga_cancer_types
    else:
        # sorted so that the error message is the same on every run
        not_in_tcga = sorted(
            set(args.holdout_cancer_types) - set(tcga_cancer_types))
        if not_in_tcga:
            p.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))

    if args.tca:
        # bundle the TCA options into one dict; sigma is not exposed on
        # the command line and is always 1.0
        args.tca_params = {
            'mu': args.tca_mu,
            'kernel_type': args.tca_kernel_type,
            'sigma': 1.0,
            'n_components': args.tca_n_components
        }
    else:
        args.tca_params = None

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    return args, sample_info_df