def _load_data(self, sample_info=None, debug=False, test=False):
    """Read the datasets shared across all genes/cancer types and cache them.

    Nothing loaded here depends on the gene or cancer type being
    analyzed, so this only needs to run once, when the class is
    instantiated.

    Arguments
    ---------
    sample_info: sample info to store as self.sample_info_df; when None,
                 it is loaded with du.load_sample_info instead
    debug (bool): whether or not to subset data for faster debugging
    test (bool): whether or not to subset columns in mutation data,
                 for testing
    """
    # gene expression comes first (optionally subset when debugging)
    self.rnaseq_df = du.load_expression_data(verbose=self.verbose,
                                             debug=debug)

    # prefer the sample info handed in by the caller, load it otherwise
    self.sample_info_df = (
        sample_info if sample_info is not None
        else du.load_sample_info(verbose=self.verbose)
    )

    # pancancer data is described in more detail in the
    # load_pancancer_data docstring; in test mode only the columns for
    # cfg.test_genes are requested, which is much faster than loading
    # mutation data for all genes
    pancan_kwargs = {'verbose': self.verbose}
    if test:
        pancan_kwargs.update(test=True, subset_columns=cfg.test_genes)
    pancan_data = du.load_pancancer_data(**pancan_kwargs)

    # unpack the five pancancer tables onto the instance
    (self.sample_freeze_df,
     self.mutation_df,
     self.copy_loss_df,
     self.copy_gain_df,
     self.mut_burden_df) = pancan_data
def process_args():
    """Parse and validate the command-line arguments for this script.

    On top of plain argparse parsing, this function:

    * resolves the gene set: with --gene_set=custom, args.gene_set is
      replaced by the list passed to --custom_genes and the custom_genes
      attribute is removed from the namespace
    * checks --holdout_cancer_types against the cancer types found in the
      sample info data, defaulting to all of them when none are given
    * resolves results_dir to an absolute Path and, when --log_file is
      not given, defaults log_file to 'log_skipped.tsv' inside it

    Invalid argument combinations are reported through
    ArgumentParser.error, which prints a usage message and exits.

    Returns
    -------
    args: the processed argparse namespace
    sample_info_df: sample info as returned by du.load_sample_info
    """
    p = argparse.ArgumentParser()
    p.add_argument('--custom_genes', nargs='*', default=None,
                   help='currently this needs to be a subset of top_50')
    p.add_argument('--debug', action='store_true',
                   help='use subset of data for fast debugging')
    p.add_argument(
        '--gene_set', type=str,
        choices=['top_50', 'vogelstein', 'custom'],
        default='top_50',
        help='choose which gene set to use. top_50 and vogelstein are '
             'predefined gene sets (see data_utilities), and custom allows '
             'any gene or set of genes in TCGA, specified in --custom_genes')
    p.add_argument('--holdout_cancer_types', nargs='*', default=None,
                   help='provide a list of cancer types to hold out, uses all '
                        'cancer types in TCGA if none are provided')
    p.add_argument('--how_to_add', type=str,
                   choices=['random', 'similarity'], default='random',
                   help='Method for choosing cancer types to add to the '
                        'training dataset; see data model for details')
    p.add_argument('--log_file', default=None,
                   help='name of file to log skipped cancer types to')
    p.add_argument('--num_folds', type=int, default=4,
                   help='number of folds of cross-validation to run')
    p.add_argument('--results_dir', default=cfg.results_dir,
                   help='where to write results to')
    p.add_argument('--seed', type=int, default=cfg.default_seed)
    p.add_argument('--subset_mad_genes', type=int,
                   default=cfg.num_features_raw,
                   help='if included, subset gene features to this number of '
                        'features having highest mean absolute deviation')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    if args.gene_set == 'custom':
        # nargs='*' yields [] for a bare --custom_genes, so test for
        # emptiness rather than only for None; an empty custom gene set
        # would otherwise be accepted silently
        if not args.custom_genes:
            p.error('must include --custom_genes when --gene_set=\'custom\'')
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif args.custom_genes is not None:
        # gene_set is known not to be 'custom' in this branch
        p.error(
            'must use option --gene_set=\'custom\' if custom genes are included'
        )

    sample_info_df = du.load_sample_info(args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))

    if args.holdout_cancer_types is None:
        args.holdout_cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.holdout_cancer_types) - set(tcga_cancer_types)
        if not_in_tcga:
            # sort so the error message is the same on every run
            # (set iteration order is not stable for strings)
            p.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(sorted(not_in_tcga))))

    args.results_dir = Path(args.results_dir).resolve()

    # a user-supplied --log_file is left exactly as given
    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    return args, sample_info_df
def data_model():
    """Build a TCGADataModel and load the sample info data.

    The model is constructed with debug=True and test=True. Returns the
    pair (tcga_data, sample_info_df).
    """
    # TODO: define results dir?
    # the tuple is evaluated left to right, so the data model is still
    # built before the sample info is loaded
    return TCGADataModel(debug=True, test=True), du.load_sample_info()
# write the log dataframe out as a tab-separated file
log_df.to_csv(args.log_file, sep='\t')

# build the data model from the parsed command-line options
tcga_data = TCGADataModel(
    seed=args.seed,
    subset_mad_genes=args.subset_mad_genes,
    verbose=args.verbose,
    debug=args.debug,
)

# same sampled genes from cross-cancer individual identifier experiments
genes_df = du.load_vogelstein()
sampled_genes = [
    'APC',
    'BRCA1',
    'EGFR',
    'FGFR2',
    'H3F3A',
    'HRAS',
    'MSH2',
    'PIK3CA',
    'PPP2R1A',
    'VHL',
]

# and use all cancer types in TCGA
sample_info_df = du.load_sample_info(args.verbose)
tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))

# identifiers have the format {gene}_{cancer_type}; the nested loops
# enumerate every (gene, cancer type) pair with the cancer type varying
# fastest
test_identifiers = [
    '_'.join((gene, cancer_type))
    for gene in sampled_genes
    for cancer_type in tcga_cancer_types
]

# create output directory (including any missing parents)
output_dir = (Path(args.results_dir) / 'pan_cross_cancer').resolve()
output_dir.mkdir(parents=True, exist_ok=True)

# once with the labels as-is, then once with shuffle_labels=True
for shuffle_labels in (False, True):
    print('shuffle_labels: {}'.format(shuffle_labels))
def expression_data():
    """Read the test expression matrix and the sample info data.

    The expression file at cfg.test_expression is parsed as
    tab-separated text, using its first column as the index. Returns the
    pair (rnaseq_df, sample_info_df).
    """
    # the tuple is evaluated left to right, so the expression file is
    # still read before the sample info is loaded
    return (
        pd.read_csv(cfg.test_expression, index_col=0, sep='\t'),
        du.load_sample_info(),
    )
def process_args():
    """Parse and validate the command-line arguments for this script.

    On top of plain argparse parsing, this function:

    * resolves the gene set: with --gene_set=custom, args.gene_set is
      replaced by the list passed to --custom_genes and the custom_genes
      attribute is removed from the namespace
    * checks --holdout_cancer_types against the cancer types found in the
      sample info data, defaulting to all of them when none are given
    * collects the TCA options into args.tca_params (a dict) when --tca
      is set, and sets args.tca_params to None otherwise
    * resolves results_dir to an absolute Path and, when --log_file is
      not given, defaults log_file to 'log_skipped.tsv' inside it

    Invalid argument combinations are reported through
    ArgumentParser.error, which prints a usage message and exits.

    Returns
    -------
    args: the processed argparse namespace
    sample_info_df: sample info as returned by du.load_sample_info
    """
    p = argparse.ArgumentParser()
    # NOTE: adjacent string literals are concatenated verbatim, so each
    # piece of a multi-part help string must end in a space
    p.add_argument('--coral', action='store_true',
                   help='if true, use CORAL method to align source and '
                        'target distributions')
    p.add_argument('--coral_by_cancer_type', action='store_true',
                   help='if true, use CORAL method to align source and '
                        'target distributions, per cancer type')
    p.add_argument('--coral_lambda', type=float, default=1.0)
    p.add_argument('--custom_genes', nargs='*', default=None,
                   help='currently this needs to be a subset of top_50')
    p.add_argument('--debug', action='store_true',
                   help='use subset of data for fast debugging')
    p.add_argument(
        '--gene_set', type=str,
        choices=['top_50', 'vogelstein', 'custom'],
        default='top_50',
        help='choose which gene set to use. top_50 and vogelstein are '
             'predefined gene sets (see data_utilities), and custom allows '
             'any gene or set of genes in TCGA, specified in --custom_genes')
    p.add_argument('--holdout_cancer_types', nargs='*', default=None,
                   help='provide a list of cancer types to hold out, uses all '
                        'cancer types in TCGA if none are provided')
    p.add_argument('--log_file', default=None,
                   help='name of file to log skipped cancer types to')
    p.add_argument('--num_folds', type=int, default=4,
                   help='number of folds of cross-validation to run')
    p.add_argument(
        '--pancancer_only', action='store_true',
        help='if included, omit test cancer type data from training '
             'set for pancancer experiments')
    p.add_argument('--results_dir', default=cfg.results_dir,
                   help='where to write results to')
    p.add_argument('--seed', type=int, default=cfg.default_seed)
    p.add_argument('--subset_mad_genes', type=int,
                   default=cfg.num_features_raw,
                   help='if included, subset gene features to this number of '
                        'features having highest mean absolute deviation')
    p.add_argument('--tca', action='store_true',
                   help='if true, use TCA method to map source and target '
                        'data into same feature space')
    p.add_argument('--tca_kernel_type', choices=['linear', 'rbf'],
                   default='linear')
    p.add_argument('--tca_mu', type=float, default=0.1)
    p.add_argument('--tca_n_components', type=int, default=100)
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    if args.gene_set == 'custom':
        # nargs='*' yields [] for a bare --custom_genes, so test for
        # emptiness rather than only for None; an empty custom gene set
        # would otherwise be accepted silently
        if not args.custom_genes:
            p.error('must include --custom_genes when --gene_set=\'custom\'')
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif args.custom_genes is not None:
        # gene_set is known not to be 'custom' in this branch
        p.error(
            'must use option --gene_set=\'custom\' if custom genes are included'
        )

    sample_info_df = du.load_sample_info(args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))

    if args.holdout_cancer_types is None:
        args.holdout_cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.holdout_cancer_types) - set(tcga_cancer_types)
        if not_in_tcga:
            # sort so the error message is the same on every run
            # (set iteration order is not stable for strings)
            p.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(sorted(not_in_tcga))))

    if args.tca:
        # sigma is fixed at 1.0 here; it is not exposed as an option
        args.tca_params = {
            'mu': args.tca_mu,
            'kernel_type': args.tca_kernel_type,
            'sigma': 1.0,
            'n_components': args.tca_n_components
        }
    else:
        args.tca_params = None

    args.results_dir = Path(args.results_dir).resolve()

    # a user-supplied --log_file is left exactly as given
    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    return args, sample_info_df