Example #1
0
    except KeyError:
        continue

# de-duplicate legend entries: keep exactly one handle per unique label
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
by_label = {label: handle for label, handle in zip(labels, handles)}
plt.legend(by_label.values(), by_label.keys(), title='Stage at diagnosis')
plt.xlabel('Time (days)')
plt.ylabel('Predicted survival probability')

# ### Color survival curves by cancer subtype
#
# Here, we color the survival curves by cancer subtype. Generally, there are pronounced differences in survival between subtypes of any given cancer, although the exact differences depend on the cancer type.

# In[10]:

# element 0 of load_pancancer_data() is the sample freeze dataframe;
# it carries per-sample DISEASE and SUBTYPE annotations (used below)
sample_freeze_df = du.load_pancancer_data()[0]
sample_freeze_df.head()

# In[11]:

# enumerate the subtypes observed for the current cancer type, and map
# each one to a stable integer index (sorted for determinism)
disease_mask = (sample_freeze_df.DISEASE == cancer_type)
subtypes = sample_freeze_df[disease_mask].SUBTYPE.unique()
subtype_to_ix = {}
for index, subtype in enumerate(sorted(subtypes)):
    subtype_to_ix[subtype] = index
print(subtype_to_ix)

# In[12]:

# set the seaborn/matplotlib figure size for the plots below
sns.set({'figure.figsize': (10, 8)})

# color by cancer subtype
for ix in range(len(fns)):
Example #2
0
# In[2]:

# fetch TCGA barcode lookup tables: cancer-type and sample-type
# dataframes plus their code dictionaries
barcode_info = tu.get_tcga_barcode_info()
cancer_types_df, cancertype_codes_dict, sample_types_df, sampletype_codes_dict = barcode_info
cancer_types_df.head(2)

# In[3]:

# peek at the sample-type lookup table loaded in the previous cell
sample_types_df.head(2)

# ### Load and process somatic mutation data

# In[4]:

# element 0 of load_pancancer_data() is the sample freeze dataframe
pancan_data = du.load_pancancer_data(verbose=True)
sample_freeze_df = pancan_data[0]

# sanity check: sample barcodes must be unique before we can use them
# as the index (compute the duplicate count once, not twice)
n_duplicates = sample_freeze_df.duplicated(['SAMPLE_BARCODE']).sum()
print(n_duplicates)
assert (n_duplicates == 0)

# index by sample barcode, renamed to the project-wide 'sample_id'
sample_freeze_df.set_index('SAMPLE_BARCODE', inplace=True)
sample_freeze_df.index.rename('sample_id', inplace=True)
sample_freeze_df.head()

# ### Process TCGA cancer type and sample type info from barcodes
#
# See https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes for more details.

# In[5]:
Example #3
0
# get sample list for each -omics data type
import sys  # hoisted: was re-imported inside the except handler on every miss

sample_lists = {}
for training_data, sample_info_file in cfg.sample_infos.items():
    samples = pd.read_csv(sample_info_file, sep='\t', index_col=0).index
    try:
        sample_lists[data_map[training_data]] = set(samples)
    except KeyError:
        # bias-corrected results have no data_map entry; skip them here,
        # noting the skipped key on stderr
        print(training_data, file=sys.stderr)
        continue

# In[4]:

# add mutation data to sample list
pancan_data = du.load_pancancer_data()
(sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df,
 mut_burden_df) = pancan_data

# report the dimensions of each unpacked dataframe
for df in (sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df,
           mut_burden_df):
    print(df.shape)

# In[5]:

# all these dfs contain the same samples, so just use one of the indexes
# (presumably mutation/CNV/TMB share a sample freeze -- see unpack above)
sample_lists['mutation'] = set(mutation_df.index)

# In[6]:
Example #4
0
    def _load_data(self,
                   train_data_type,
                   compressed_data=False,
                   standardize_input=False,
                   n_dim=None,
                   sample_info_df=None,
                   debug=False,
                   test=False):
        """Load and store relevant data.

        This data does not vary based on the gene/cancer type being considered
        (i.e. it can be loaded only once when the class is instantiated).

        Arguments:
        ----------
        train_data_type (str or list): -omics data type(s) to load; passing a
            list loads each type and concatenates their columns
        compressed_data (bool): whether to load compressed (reduced-dimension)
            data instead of raw data
        standardize_input (bool): whether to standardize input features
            (passed through to the du.load_* helpers)
        n_dim: number of dimensions for compressed data; should be a list
            when train_data_type is a list
        sample_info_df (pd.DataFrame or None): pre-loaded sample info; if
            None it is loaded here via du.load_sample_info
        debug (bool): whether or not to subset data for faster debugging
        test (bool): whether or not to subset columns in mutation data, for testing

        Side effects: sets self.sample_freeze_df, self.mutation_df,
        self.copy_loss_df, self.copy_gain_df, self.mut_burden_df,
        self.data_df, self.sample_info_df (and self.data_types when a list
        of training data types is given).
        """
        # first load and unpack pancancer mutation/CNV/TMB data
        # this data is described in more detail in the load_pancancer_data docstring
        if test:
            # for testing, just load a subset of pancancer data,
            # this is much faster than loading mutation data for all genes
            import mpmp.test_config as tcfg
            pancan_data = du.load_pancancer_data(
                verbose=self.verbose,
                test=True,
                subset_columns=tcfg.test_genes)
        else:
            pancan_data = du.load_pancancer_data(verbose=self.verbose)

        (self.sample_freeze_df, self.mutation_df, self.copy_loss_df,
         self.copy_gain_df, self.mut_burden_df) = pancan_data

        # now load training data
        if not isinstance(train_data_type, str):
            # if a list of train data types is provided, we have to load each
            # of them and concatenate columns
            # n_dim should be a list here
            self.data_df, self.data_types = du.load_multiple_data_types(
                train_data_type,
                n_dims=n_dim,
                standardize_input=standardize_input,
                verbose=self.verbose)
        elif compressed_data:
            # debug/test both imply loading only a subset of samples
            self.data_df = du.load_compressed_data(
                train_data_type,
                n_dim=n_dim,
                verbose=self.verbose,
                standardize_input=standardize_input,
                load_subset=(debug or test))
        elif train_data_type == 'baseline':
            # we just want to use non-omics covariates as a baseline
            # so here, get sample list for expression data, then create an
            # empty data frame using it as an index
            if sample_info_df is None:
                sample_info_df = du.load_sample_info('expression',
                                                     verbose=self.verbose)
            self.data_df = pd.DataFrame(index=sample_info_df.index)
        else:
            if train_data_type == 'vogelstein_mutations':
                self.data_df = self._load_vogelstein_mutation_matrix()
            elif train_data_type == 'significant_mutations':
                data_df = self._load_vogelstein_mutation_matrix()
                # NOTE(review): significant genes are always loaded for
                # 'methylation' here regardless of train_data_type -- confirm
                # this hard-coded choice is intentional
                sig_genes = du.load_significant_genes('methylation')
                # startswith() with a tuple argument returns True if
                # the string matches any of the prefixes in the tuple
                # https://stackoverflow.com/a/20461857
                self.data_df = data_df.loc[:,
                                           data_df.columns.str.
                                           startswith(tuple(sig_genes))]
            elif 'mutation_preds' in train_data_type:
                self.data_df = du.load_mutation_predictions(train_data_type)
            else:
                # fall back to loading raw data for the requested type
                self.data_df = du.load_raw_data(train_data_type,
                                                verbose=self.verbose,
                                                load_subset=(debug or test))

        if sample_info_df is None:
            self.sample_info_df = du.load_sample_info(train_data_type,
                                                      verbose=self.verbose)
        else:
            # sometimes we load sample info in the calling script as part of
            # argument processing, etc
            # in that case, we don't need to load it again
            self.sample_info_df = sample_info_df