def data_model(data_type):
    """Create a TCGA data model and load the matching sample info.

    The data model is built with both debug=True and test=True.

    Arguments:
    ----------
    data_type (str): training data type, passed to both TCGADataModel
                     and du.load_sample_info

    Returns:
    --------
    tuple of (data model, sample info) in that order
    """
    # NOTE: handing arguments (such as data_type) to a fixture and then
    # using them in tests isn't widely documented in pytest, but seems to
    # work; see, e.g. https://stackoverflow.com/a/60148972
    model = TCGADataModel(training_data=data_type, debug=True, test=True)
    sample_info = du.load_sample_info(train_data_type=data_type)
    return model, sample_info
def generate_data_model(data_type, verbose=False):
    """Create a TCGA data model (test=True) and load its sample info.

    Arguments:
    ----------
    data_type (str): training data type, passed to both TCGADataModel
                     and du.load_sample_info
    verbose (bool): forwarded unchanged to both calls

    Returns:
    --------
    tuple of (data model, sample info) in that order
    """
    model = TCGADataModel(training_data=data_type,
                          test=True,
                          verbose=verbose)
    sample_info = du.load_sample_info(train_data_type=data_type,
                                      verbose=verbose)
    return model, sample_info
model_options.alphas = cfg.alphas model_options.l1_ratios = cfg.l1_ratios model_options.standardize_data_types = cfg.standardize_data_types model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type model_options.training_data = 'expression' model_options.overlap_data_types = ['expression'] model_options.bc_titration = True return io_args, model_options if __name__ == '__main__': # process command line arguments io_args, model_options = process_args() sample_info_df = du.load_sample_info(model_options.training_data, verbose=io_args.verbose) # create results dir and subdir for experiment if they don't exist experiment_dir = Path(io_args.results_dir, 'gene').resolve() experiment_dir.mkdir(parents=True, exist_ok=True) # save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = ['gene', 'titration_ratio', 'shuffle_labels', 'skip_reason'] tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes,
# second panel (axarr[1]): AUROC box plot, split on the x axis by the
# 'signal' column and colored by the 'training_data' column
sns.boxplot(data=plot_df, x='signal', y='auroc', hue='training_data',
            ax=axarr[1])
axarr[1].set_title(
    'Binarized tumor purity prediction performance, by data type')
axarr[1].set_xlabel('Signal or shuffled')
axarr[1].set_ylabel('AUROC')
axarr[1].legend(title='Data type')

# ### Plot results faceted by cancer type

# In[5]:

# sample info is loaded for the 'expression' data type and handed to the
# per-cancer-type results loader
sample_info_df = du.load_sample_info('expression')
results_df = au.load_purity_by_cancer_type(results_dir, sample_info_df)
# show which training data types are present in the loaded results
print(results_df.training_data.unique())
results_df.head()

# In[6]:

# per-cancer-type row counts, largest first: groupby().count() counts
# non-null entries in each remaining column, the 'id_for_stratification'
# count is dropped, and the 'sample_type' count is renamed to 'count'
# NOTE(review): presumably sample_type is never null, so this equals the
# number of samples per cancer type -- confirm
top_cancer_types = (sample_info_df.groupby('cancer_type').count().drop(
    columns=['id_for_stratification']).rename(columns={
        'sample_type': 'count'
    }).sort_values(by='count', ascending=False))
top_cancer_types.head()

# In[7]:

sns.set({'figure.figsize': (15, 12)})
def process_args():
    """Build the command line parser, parse, and post-process arguments.

    Returns:
    --------
    io_args: arguments from the 'io' group (input/output options)
    model_options: arguments from the 'model_options' group, extended with
                   hyperparameter ranges from the config file and the list
                   of overlap data types
    sample_info_df: sample info loaded for the chosen training data
    """
    parser = argparse.ArgumentParser()

    # --- io group ---
    # filenames, logging/verbosity options, target cancer types; none of
    # these change the model output, so they are not stored alongside the
    # experiment results
    io_group = parser.add_argument_group(
        'io',
        'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io_group.add_argument(
        '--cancer_types',
        nargs='*',
        help='cancer types to predict, if not included predict '
             'all cancer types in TCGA')
    io_group.add_argument(
        '--log_file',
        default=None,
        help='name of file to log skipped cancer types to')
    io_group.add_argument('--output_preds', action='store_true')
    io_group.add_argument(
        '--results_dir',
        default=cfg.results_dirs['cancer_type'],
        help='where to write results to')
    io_group.add_argument('--verbose', action='store_true')

    # --- model_options group ---
    # hyperparameters and preprocessing options; these do change the model
    # output, so they get saved in the same directory as the results
    model_group = parser.add_argument_group(
        'model_options',
        'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    model_group.add_argument(
        '--debug',
        action='store_true',
        help='use subset of data for fast debugging')
    model_group.add_argument(
        '--num_folds',
        type=int,
        default=4,
        help='number of folds of cross-validation to run')
    model_group.add_argument('--seed', type=int, default=cfg.default_seed)
    model_group.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
             'features having highest mean absolute deviation')
    model_group.add_argument(
        '--training_data',
        type=str,
        default='expression',
        choices=list(cfg.data_types.keys()),
        help='what data type to train model on')

    args = parser.parse_args()

    # normalize paths; the skipped-cancer-type log defaults to a file
    # inside the results directory
    args.results_dir = Path(args.results_dir).resolve()
    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    # every requested cancer type has to be a cancer type present in the
    # sample info for the chosen training data
    sample_info_df = du.load_sample_info(args.training_data, args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    if args.cancer_types is not None:
        not_in_tcga = set(args.cancer_types) - set(tcga_cancer_types)
        if len(not_in_tcga) > 0:
            parser.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))
    else:
        # flag not given at all: run every cancer type
        args.cancer_types = tcga_cancer_types

    # the two argument groups are consumed differently downstream, so
    # hand them back separately
    arg_groups = du.split_argument_groups(args, parser)
    io_args = arg_groups['io']
    model_options = arg_groups['model_options']

    # values taken from the config file rather than the command line;
    # users are not meant to change these, hence no arguments for them
    model_options.n_dim = None
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types

    # record which data types define the set of valid samples
    overlap_data_types = get_overlap_data_types(
        use_subsampled=model_options.debug)
    model_options.sample_overlap_data_types = list(overlap_data_types.keys())

    return io_args, model_options, sample_info_df
return subsample_df if __name__ == '__main__': p = argparse.ArgumentParser() p.add_argument('--dataset', type=str, choices=['expression', 'me_27k', 'all'], default='all') p.add_argument('--verbose', action='store_true') args = p.parse_args() cfg.subsampled_data_dir.mkdir(parents=True, exist_ok=True) if args.dataset in ['expression', 'all']: sample_info_df = du.load_sample_info(train_data_type='expression', verbose=args.verbose) rnaseq_df = du.load_raw_data(train_data_type='expression', verbose=args.verbose) if args.verbose: print('Generating subsampled expression data...', end='') subsample_df = subsample_stratified(rnaseq_df, sample_info_df) subsample_df.to_csv(cfg.subsampled_expression, sep='\t', compression='gzip', float_format='%.3g') if args.verbose: print('done') if args.dataset in ['me_27k', 'all']: sample_info_df = du.load_sample_info(train_data_type='me_27k', verbose=args.verbose)
def calculate_gene_count(overlap_data_types, seeds, num_folds):
    """Check, per seed and gene, whether every CV fold has both labels.

    For each seed a TCGADataModel is built on the given overlap data types
    and each gene of the 'vogelstein' gene set is processed. Progress and
    label counts are printed to stderr.

    Arguments:
    ----------
    overlap_data_types: passed to TCGADataModel as overlap_data_types
    seeds: iterable of seeds; each is used for the data model and CV splits
    num_folds (int): number of stratified CV folds to check

    Returns:
    --------
    list of (gene, seed, valid, reason) tuples, where reason is one of
    'N/A', 'no_valid_cancer_types', 'one_train_class', 'one_test_class'.
    Genes for which process_data_for_gene raises KeyError produce no entry.
    """
    sample_info_df = du.load_sample_info('expression')
    results = []

    for seed in seeds:
        tcga_data = TCGADataModel(seed=seed,
                                  overlap_data_types=overlap_data_types)
        genes_df = tcga_data.load_gene_set('vogelstein')

        for _, gene_row in genes_df.iterrows():
            gene = gene_row.gene
            print(gene, file=sys.stderr)

            try:
                tcga_data.process_data_for_gene(gene,
                                                gene_row.classification,
                                                None)
            except KeyError:
                # gene can't be processed; leave it out of the results
                continue

            labels = tcga_data.y_df.status
            n_pos = np.count_nonzero(labels)
            n_neg = len(labels) - n_pos
            print(n_pos, n_neg, file=sys.stderr)

            # no samples left means no valid cancer types for this gene;
            # record that and move on to the next gene
            if tcga_data.X_df.shape[0] == 0:
                results.append((gene, seed, False, 'no_valid_cancer_types'))
                continue

            # only the sample index matters here, so keep just the first
            # 50 feature columns to speed up CV
            tcga_data.X_df = tcga_data.X_df.iloc[:, :50]

            # walk the folds and stop at the first one whose train or test
            # labels contain a single class
            failure = None
            for fold_no in range(num_folds):
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        message='The least populated class in y')
                    X_train, X_test, _ = cv.split_stratified(
                        tcga_data.X_df,
                        sample_info_df,
                        num_folds=num_folds,
                        fold_no=fold_no,
                        seed=seed)

                y_train = tcga_data.y_df.reindex(X_train.index)
                y_test = tcga_data.y_df.reindex(X_test.index)

                train_pos = np.count_nonzero(y_train.status)
                train_neg = len(y_train.status) - train_pos
                test_pos = np.count_nonzero(y_test.status)
                test_neg = len(y_test.status) - test_pos
                print(fold_no, train_pos, train_neg, test_pos, test_neg,
                      file=sys.stderr)

                if train_pos == 0 or train_neg == 0:
                    failure = 'one_train_class'
                    break
                if test_pos == 0 or test_neg == 0:
                    failure = 'one_test_class'
                    break

            if failure is None:
                results.append((gene, seed, True, 'N/A'))
            else:
                results.append((gene, seed, False, failure))

    return results
import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler import mpmp.config as cfg import mpmp.utilities.data_utilities as du # In[2]: DATA_TYPE = 'mut_sigs' # load gene/classification info and sample/cancer type info print('Loading gene label data...', file=sys.stderr) genes_df = du.load_vogelstein() sample_info_df = du.load_sample_info(DATA_TYPE, verbose=True) # load mutation info # this returns a tuple of dataframes, unpack it below pancancer_data = du.load_pancancer_data(verbose=True) (sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df, mut_burden_df) = pancancer_data # In[3]: # load relevant data data_df = du.load_raw_data(DATA_TYPE, verbose=True) # standardize columns of expression dataframe if DATA_TYPE in cfg.standardize_data_types: print('Standardizing columns of {} data...'.format(DATA_TYPE),
# this is the number of valid genes in the Vogelstein gene set NUM_GENES = 85 # sample random genes from set of genes with every gene with >= NUM_CANCERS # valid cancer types # # if we sampled them randomly from all genes, it's likely that many of them # would end up with no valid cancer types (i.e. not enough mutations to train # a classifier), so we add this criterion to make sure they have at least one NUM_CANCERS = 1 # ### Load mutation and sample/cancer type info # In[3]: sample_info_df = du.load_sample_info('expression', verbose=True) pancancer_data = du.load_pancancer_data(verbose=True) mutation_df = pancancer_data[1] mut_burden_df = pancancer_data[4] print(sample_info_df.shape) print(mutation_df.shape) print(mut_burden_df.shape) # In[4]: # merge sample info and mutation burden info hyper_filter = 5 print(mutation_df.shape) mutations_df = (mutation_df.merge(sample_info_df,
compare_results_df['nlog10_p'], compare_results_df.identifier, compare_results_df.reject_null, plt.gca()) adjust_text(text_labels, ax=plt.gca()) # ## Confusion matrix # In[10]: import os import mpmp.utilities.data_utilities as du preds_dir = os.path.join(cfg.results_dirs['cancer_type'], 'results_preds', 'cancer_type') sample_info_df = du.load_sample_info() preds_expression_df = au.load_preds_to_matrix(preds_dir, sample_info_df, training_data='expression') print(preds_expression_df.shape) preds_expression_df.iloc[:5, :5] # In[11]: sns.set({'figure.figsize': (15, 10)}) ax = sns.heatmap( preds_expression_df, cbar_kws={ 'label': 'Predicted probability of positive label, averaged over samples'
def process_args():
    """Parse and format command line arguments.

    Returns:
    --------
    io_args: arguments from the 'io' group (input/output options)
    model_options: arguments from the 'model_options' group, extended with
                   survival-specific hyperparameter settings from the
                   config file
    sample_info_df: sample info for the chosen training data (expression
                    sample info is used when training_data is 'baseline')
    """
    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument(
        '--cancer_types',
        nargs='*',
        default=['all_cancer_types'],
        help='cancer types to run, \'pancancer\' for a pan-cancer model '
        'combining cancer types, default is all individual TCGA '
        'cancer types + pan-cancer model')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped cancer types to')
    io.add_argument('--output_survival_fn', action='store_true')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['survival'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument(
        '--fit_ridge',
        action='store_true',
        help='if included, fit ridge-regularized survival model instead '
        'of elastic net model. this tends to converge slightly faster '
        'and more robustly on smaller feature sets, but may fit slowly '
        'or not at all on large sets of features')
    # type=int lets argparse reject a non-integer value with a normal usage
    # error; the None default is left untouched since argparse only applies
    # `type` to string values
    opts.add_argument(
        '--n_dim',
        type=int,
        default=None,
        help='number of compressed components/dimensions to use, '
        'None to use raw features')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--overlap_data_types',
                      nargs='*',
                      default=['expression'],
                      help='data types to define set of samples to use; e.g. '
                      'set of data types for a model comparison, use only '
                      'overlapping samples from these data types')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')
    opts.add_argument('--training_data',
                      type=str,
                      default='expression',
                      choices=list(cfg.data_types.keys()) + ([
                          'baseline', 'vogelstein_mutations',
                          'significant_mutations',
                          'mutation_preds_expression',
                          'mutation_preds_me_27k', 'mutation_preds_me_450k'
                      ]),
                      help='what data type to train model on')
    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()
    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    # 'baseline' is looked up via the expression sample info; every other
    # training data type is passed to load_sample_info directly
    if args.training_data == 'baseline':
        sample_info_source = 'expression'
    else:
        sample_info_source = args.training_data
    sample_info_df = du.load_sample_info(sample_info_source,
                                         verbose=args.verbose)

    # valid choices are the individual cancer types plus 'pancancer'
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    tcga_cancer_types.append('pancancer')
    if 'all_cancer_types' in args.cancer_types:
        args.cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.cancer_types) - set(tcga_cancer_types)
        if not_in_tcga:
            # sorted so the message is the same from run to run
            parser.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(sorted(not_in_tcga))))

    # check that all data types in overlap_data_types are valid
    check_all_data_types(parser, args.overlap_data_types, args.debug)

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.max_iter = cfg.max_iter_map['survival']
    model_options.alphas = cfg.alphas_map['survival']
    model_options.l1_ratios = cfg.l1_ratios_map['survival']
    model_options.standardize_data_types = cfg.standardize_data_types
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type

    return io_args, model_options, sample_info_df
def _load_data(self,
               train_data_type,
               compressed_data=False,
               standardize_input=False,
               n_dim=None,
               sample_info_df=None,
               debug=False,
               test=False):
    """Load and store the data shared across all genes/cancer types.

    Nothing loaded here depends on the gene or cancer type under
    consideration, so this only has to run once, when the class is
    instantiated. Results are stored as attributes on self.

    Arguments:
    ----------
    train_data_type (str or sequence): data type to train on; a non-string
                                       value is treated as several data
                                       types to load and concatenate
    compressed_data (bool): for a single data type, load compressed
                            rather than raw data
    standardize_input (bool): forwarded to the multi-data-type and
                              compressed data loaders
    n_dim: number of dimensions, forwarded to the multi-data-type and
           compressed data loaders (should be a list in the former case)
    sample_info_df: sample info already loaded by the caller, or None to
                    load it here
    debug (bool): whether or not to subset data for faster debugging
    test (bool): whether or not to subset columns in mutation data,
                 for testing
    """
    load_subset = (debug or test)

    # pancancer mutation/CNV/TMB data comes first; see the
    # load_pancancer_data docstring for a description of what it contains
    pancan_kwargs = {'verbose': self.verbose}
    if test:
        # tests only need the test genes, which is far quicker to load
        # than the mutation data for every gene
        import mpmp.test_config as tcfg
        pancan_kwargs.update(test=True, subset_columns=tcfg.test_genes)
    pancan_data = du.load_pancancer_data(**pancan_kwargs)
    (self.sample_freeze_df,
     self.mutation_df,
     self.copy_loss_df,
     self.copy_gain_df,
     self.mut_burden_df) = pancan_data

    # training data: pick the loader that matches the requested data type
    if not isinstance(train_data_type, str):
        # several data types were requested: load each one and concatenate
        # their columns (n_dim should be a list in this case)
        self.data_df, self.data_types = du.load_multiple_data_types(
            train_data_type,
            n_dims=n_dim,
            standardize_input=standardize_input,
            verbose=self.verbose)
    elif compressed_data:
        self.data_df = du.load_compressed_data(
            train_data_type,
            n_dim=n_dim,
            verbose=self.verbose,
            standardize_input=standardize_input,
            load_subset=load_subset)
    elif train_data_type == 'baseline':
        # the baseline uses only non-omics covariates, so all that is
        # needed is an empty data frame indexed by the expression samples
        if sample_info_df is None:
            sample_info_df = du.load_sample_info('expression',
                                                 verbose=self.verbose)
        self.data_df = pd.DataFrame(index=sample_info_df.index)
    elif train_data_type == 'vogelstein_mutations':
        self.data_df = self._load_vogelstein_mutation_matrix()
    elif train_data_type == 'significant_mutations':
        mutation_matrix = self._load_vogelstein_mutation_matrix()
        sig_genes = du.load_significant_genes('methylation')
        # startswith() given a tuple is True when the string begins with
        # any of the prefixes in it
        # https://stackoverflow.com/a/20461857
        keep_columns = mutation_matrix.columns.str.startswith(
            tuple(sig_genes))
        self.data_df = mutation_matrix.loc[:, keep_columns]
    elif 'mutation_preds' in train_data_type:
        self.data_df = du.load_mutation_predictions(train_data_type)
    else:
        self.data_df = du.load_raw_data(train_data_type,
                                        verbose=self.verbose,
                                        load_subset=load_subset)

    # the calling script sometimes loads sample info itself (e.g. during
    # argument processing); only load it here when that didn't happen
    if sample_info_df is None:
        sample_info_df = du.load_sample_info(train_data_type,
                                             verbose=self.verbose)
    self.sample_info_df = sample_info_df