def __init__( self, data=None, row_method="complete", column_method="complete", row_metric="euclidean", column_metric="euclidean", cmap="yellow_black_blue", col_side_colors=None, row_side_colors=None, verbose=True, ): """.. rubric:: constructor :param data: a dataframe or possibly a numpy matrix. :param row_method: complete by default :param column_method: complete by default. See linkage module for details :param row_metric: euclidean by default :param column_metric: euclidean by default :param cmap: colormap. any matplotlib accepted or combo of colors as defined in colormap package (pypi) :param col_side_colors: :param row_side_colors: """ # should be a copy since it may be reshuffled ? try: if data is None and verbose is True: print( "No data provided, please fill the `df` attribute manually" ) elif data is None: pass else: self._df = data.copy() except AttributeError as err: print("input must be a pandas data frame or numpy matrix") raise (err) self._row_method = row_method self._column_method = column_method self._column_metric = column_metric self._row_metric = row_metric # some default parameters self.cluster_criterion = "distance" self.params = easydev.AttrDict() self.params.col_side_colors = ["r", "g", "b", "y", "w", "k", "m"] self.params.row_side_colors = ["r", "g", "b", "y", "w", "k", "m"] self.params.cmap = cmap self.category_row = {} self.category_column = {} if col_side_colors: self.params.col_side_colors = col_side_colors if row_side_colors: self.params.row_side_colors = row_side_colors
def __init__(self, data=None, row_method='complete', column_method='complete', row_metric='euclidean', column_metric='euclidean', cmap='yellow_black_blue', col_side_colors=None, row_side_colors=None, verbose=True): """.. rubric:: constructor :param data: a dataframe or possibly a numpy matrix. .. todo:: if row_method id none, no ordering in the dendogram """ # should be a copy since it may be reshuffled ? try: if data is None and verbose is True: print( "No data provided, please fill the `df` attribute manually" ) else: self._df = data.copy() except AttributeError as err: print("input must be a pandas data frame or numpy matrix") raise (err) self._row_method = row_method self._column_method = column_method self._column_metric = column_metric self._row_metric = row_metric # some default parameters self.cluster_criterion = 'distance' self.params = easydev.AttrDict() self.params.col_side_colors = ['r', 'g', 'b', 'y', 'w', 'k', 'm'] self.params.row_side_colors = ['r', 'g', 'b', 'y', 'w', 'k', 'm'] self.params.cmap = cmap self.category_row = None self.category_column = None if col_side_colors: self.params.col_side_colors = col_side_colors if row_side_colors: self.params.row_side_colors = row_side_colors
def _build_testing(): testing = easydev.AttrDict() d = Data() d.filename = _gsf('test_drug_decode.tsv') d.description = 'drug_decode in TSV format' testing.drug_test_tsv = d d = Data() d.filename = _gsf('test_drug_decode.csv') d.description = 'drug_decode in CSV format' testing.drug_test_csv = d d = Data() d.filename = _gsf('test_ic50_11_50.csv') d.description = 'A 10drug/50 cell lines IC50 test file in CSV format' testing.ic50_test_csv = d d = Data() d.filename = _gsf('test_genomic_features.csv') d.description = 'A 50 cell lines by 20 features GenomicFeature in CSV format' testing.genomic_features_csv = d d = Data() d.filename = _gsf('test_IC50.csv') d.description = 'A 10drug/1000 cell lines IC50 test file in CSV format' testing.ic50_test = d d = Data() d.filename = _gsf('test_IC50_header2.csv') d.description = 'An IC50 test (header with column without Drug_ prefix)' testing.ic50_test_header_no_drug_prefix = d d = Data() d.filename = _gsf('test_IC50_header1.csv') d.description = 'An IC50 test (header with column with Drug_ prefix only)' testing.ic50_test_header_drug_prefix_only = d d = Data() d.filename = _gsf('test_IC50_header3.csv') d.description = 'An IC50 test (header with mixed prefixes i.e. Drug_ or not)' testing.ic50_test_header_mixed_drug_prefix = d d = Data() d.filename = _gsf('test_genomic_features_bare.csv') d.description = "A 50 cell lines by 17 features without MSI/tissue/sample" testing.genomic_features_bare_csv = d return testing
def __init__(self, data=None, method='complete', metric='euclidean', cmap='yellow_black_blue', col_side_colors=None, side_colors=None, verbose=True, horizontal=True): """.. rubric:: constructor :param data: a dataframe or possibly a numpy matrix. :param method: complete by default :param metric: euclidean by default :param cmap: colormap. any matplotlib accepted or combo of colors as defined in colormap package (pypi) :param col_side_colors: :param side_colors: """ # should be a copy since it may be reshuffled ? try: if data is None and verbose is True: print( "No data provided, please fill the `df` attribute manually" ) elif data is None: pass else: self._df = data.copy() except AttributeError as err: print("input must be a pandas data frame or numpy matrix") raise (err) self._method = method self._metric = metric self.horizontal = True # some default parameters self.cluster_criterion = 'distance' self.params = easydev.AttrDict() self.params.side_colors = ['r', 'g', 'b', 'y', 'w', 'k', 'm'] self.params.cmap = cmap self.category = {} if side_colors: self.params.side_colors = side_colors
class GenomicFeatures(Reader, CosmicRows): """Read Matrix with Genomic Features These are the compulsary column names required (note the spaces): - 'COSMIC_ID' - 'TISSUE_FACTOR' - 'MSI_FACTOR' If one of the following column is found, it is removed (deprecated):: - 'SAMPLE_NAME' - 'Sample Name' - 'CELL_LINE' and features can be also encoded with the following convention: - columns ending in "_mut" to encode a gene mutation (e.g., BRAF_mut) - columns starting with "gain_cna" - columns starting with "loss_cna" Those columns will be removed: - starting with `Drug_`, which are supposibly from the IC50 matrix :: >>> from gdsctools import GenomicFeatures >>> gf = GenomicFeatures() >>> print(gf) Genomic features distribution Number of unique tissues 27 Number of unique features 677 with - Mutation: 270 - CNA (gain): 116 - CNA (loss): 291 .. versionchanged:: 0.9.10 The header's columns' names have changed to be more consistant. Previous names are deprecated but still accepted. .. versionchanged:: 0.9.15 If a tissue is empty, it is replaced by UNDEFINED. We also strip the spaces to make sure there is "THIS" and "THIS " are the same. """ colnames = easydev.AttrDict() colnames.cosmic = 'COSMIC_ID' colnames.tissue = 'TISSUE_FACTOR' colnames.msi = 'MSI_FACTOR' colnames.media = 'MEDIA_FACTOR' def __init__(self, filename=None, empty_tissue_name="UNDEFINED"): """.. rubric:: Constructor If no file is provided, using the default file provided in the package that is made of 1001 cell lines times 680 features. :param str empty_tissue_name: if a tissue name is let empty, replace it with this string. """ # first reset the filename to the shared data (if not provided) if filename is None: from gdsctools.datasets import genomic_features filename = genomic_features # used in the header so should be ser before call to super() super(GenomicFeatures, self).__init__(filename) # FIXME Remove columns related to Drug if any. Can be removed in # the future self.df = self.df[[ x for x in self.df.columns if x.startswith('Drug_') is False ]] for this in ['Sample Name', 'SAMPLE_NAME', 'Sample_Name', 'CELL_LINE']: if this in self.df.columns: self.df.drop(this, axis=1, inplace=True) # Let us rename "COSMIC ID" into "COSMIC_ID" if needed for old, new in { 'Tissue Factor Value': 'TISSUE_FACTOR', 'MS-instability Factor Value': 'MSI_FACTOR', 'COSMIC ID': 'COSMIC_ID' }.items(): if old in self.df.columns: colorlog.warning( "'%s' column name is deprecated " % old + " since 0.9.10. Please replace with '%s'" % new, DeprecationWarning) self.df.columns = [ x.replace(old, new) for x in self.df.columns ] if "CL" in self.df.columns and "COSMID_ID" not in self.df.columns: self.df.columns = [ x.replace("CL", "COSMIC_ID") for x in self.df.columns ] # There are 3 special columns to hold the factors self._special_names = [] # If tissue factor is not provided, we create and fill it with dummies. # OTherwise, we need to change a lot in the original code in ANOVA if self.colnames.tissue not in self.df.columns: colorlog.warning( "column named '%s' not found" % self.colnames.tissue, UserWarning) self.df[self.colnames.tissue] = ['UNDEFINED'] * len(self.df) self._special_names.append(self.colnames.tissue) else: self._special_names.append(self.colnames.tissue) self.found_msi = self.colnames.msi in self.df.columns if self.found_msi is False: colorlog.warning("column named '%s' not found" % self.colnames.msi) else: self._special_names.append(self.colnames.msi) self.found_media = self.colnames.media in self.df.columns if self.found_media is False: pass #colorlog.warning("column named '%s' not found" % self.colnames.media) else: self._special_names.append(self.colnames.media) # order columns and index self._order() # self._interpret_cosmic() # self.check() self._fix_empty_tissues(empty_tissue_name) def _fix_empty_tissues(self, name="UNDEFINED"): # Sometimes, tissues may be empty so a nan is present. This lead to # to errors in ANOVA or Regression so we replace them with "UNDEFINED" N = self.df.TISSUE_FACTOR.isnull().sum() if N > 0: logger.warning( "Some tissues were empty strings and renamed as UNDEFINED!") self.df.TISSUE_FACTOR.fillna('UNDEFINED', inplace=True) def _get_shift(self): return len(self._special_names) shift = property(_get_shift) def _interpret_cosmic(self): if self.colnames.cosmic in self.df.columns: self.df.set_index(self.colnames.cosmic, inplace=True) elif self.colnames.cosmic == self.df.index.name: pass else: error_msg = "the features input file must contains a column " +\ " named %s" % self.colnames.cosmic raise ValueError(error_msg) self.df.index = [int(x) for x in self.df.index] self.df.index = self.df.index.astype(int) self.df.index.name = "COSMIC_ID" self.df.sort_index(inplace=True) def fill_media_factor(self): """Given the COSMIC identifiers, fills the MEDIA_FACTOR column If already populated, replaced by new content. """ from gdsctools import COSMICInfo c = COSMICInfo() self.df['MEDIA_FACTOR'] = [ c.get(x).SCREEN_MEDIUM for x in self.df.index ] self.found_media = True if self.colnames.media not in self._special_names: self._special_names.append(self.colnames.media) self._order() def _order(self): others = [x for x in self.df.columns if x not in self._special_names] self.df = self.df[self._special_names + others] def _get_features(self): return list(self.df.columns) def _set_features(self, features): for feature in features: if feature not in self.features: raise ValueError('Unknown feature name %s' % feature) features = [x for x in features if x.endswith('FACTOR') is False] features = self._special_names + features self.df = self.df[features] self._order() features = property(_get_features, _set_features, doc="return list of features") def _get_tissues(self): return list(self.df[self.colnames.tissue]) tissues = property(_get_tissues, doc='return list of tissues') def _get_unique_tissues(self): return list(self.df[self.colnames.tissue].unique()) unique_tissues = property(_get_unique_tissues, doc='return set of tissues') def plot(self): """Histogram of the tissues found .. plot:: :include-source: :width: 80% from gdsctools import GenomicFeatures gf = GenomicFeatures() # use the default file gf.plot() """ if self.colnames.tissue not in self.df.columns: return data = pd.get_dummies(self.df[self.colnames.tissue]).sum() data.index = [x.replace("_", " ") for x in data.index] # deprecated but works for python 3.3 try: data.sort_values(ascending=False) except: data.sort(ascending=False) pylab.figure(1) pylab.clf() labels = list(data.index) pylab.pie(data, labels=labels) pylab.figure(2) data.plot(kind='barh') pylab.grid() pylab.xlabel('Occurences') # keep the try to prevent MacOS issue try: pylab.tight_layout() except: pass return data def __str__(self): txt = 'Genomic features distribution\n' try: tissues = list(self.df[self.colnames.tissue].unique()) Ntissue = len(tissues) txt += 'Number of unique tissues {0}'.format(Ntissue) if Ntissue == 1: txt += ' ({0})\n'.format(tissues[0]) elif Ntissue < 10: txt += '\nHere are the tissues: ' txt += ",".join(tissues) + "\n" else: txt += '\nHere are the first 10 tissues: ' txt += ", ".join(tissues[0:10]) + "\n" except: txt += 'No information about tissues\n' if self.found_msi: txt += "MSI column: yes\n" else: txt += "MSI column: no\n" if self.found_media: txt += "MEDIA column: yes\n" else: txt += "MEDIA column: no\n" # -3 since we have also the MSI, tissue, media columns # TODO should use shift attribute ? Nfeatures = len(self.features) txt += '\nThere are {0} unique features distributed as\n'.format( Nfeatures - self.shift) n_mutations = len([x for x in self.df.columns if x.endswith("_mut")]) txt += "- Mutation: {}\n".format(n_mutations) n_gain = len([x for x in self.df.columns if x.startswith("gain_cna")]) txt += "- CNA (gain): {}\n".format(n_gain) n_loss = len([x for x in self.df.columns if x.startswith("loss_cna")]) txt += "- CNA (loss): {}".format(n_loss) return txt def drop_tissue_in(self, tissues): """Drop tissues from the list :param list tissues: a list of tissues to drop. If you have only one tissue, can be provided as a string. Since rows are removed some features (columns) may now be empty (all zeros). If so, those columns are dropped (except for the special columns (e.g, MSI). """ tissues = easydev.to_list(tissues) mask = self.df[self.colnames.tissue].isin(tissues) == False self.df = self.df[mask] self._cleanup() def keep_tissue_in(self, tissues): """Drop tissues not in the list :param list tissues: a list of tissues to keep. If you have only one tissue, can be provided as a string. Since rows are removed some features (columns) may now be empty (all zeros). If so, those columns are dropped (except for the special columns (e.g, MSI). """ tissues = easydev.to_list(tissues) mask = self.df[self.colnames.tissue].isin(tissues) self.df = self.df[mask] self._cleanup() def _cleanup(self, required_features=0): # FIXME: there is view/copy warning here in pandas. it should be fixed # or may have side-effects to_ignore = self._special_names # create a view ignoring the informative columns view = self.df[[x for x in self.df.columns if x not in to_ignore]] todrop = list(view.columns[view.sum() <= required_features]) self.df.drop(todrop, axis=1, inplace=True) def __repr__(self): Nc = len(self.cosmicIds) Nf = len(self.features) - self.shift try: Nt = len(set(self.tissues)) except: Nt = '?' return "GenomicFeatures <Nc={0}, Nf={1}, Nt={2}>".format(Nc, Nf, Nt) def compress_identical_features(self): """Merge duplicated columns/features Columns duplicated are merged as follows. Fhe first column is kept, others are dropped but to keep track of those dropped, the column name is renamed by concatenating the columns's names. The separator is a double underscore. :: gf = GenomicFeatures() gf.compress_identical_features() # You can now access to the column as follows (arbitrary example) gf.df['ARHGAP26_mut__G3BP2_mut'] """ # let us identify the duplicates as True/False datatr = self.df.transpose() duplicated_no_first = datatr[datatr.duplicated()] try: duplicated = datatr[datatr.duplicated(keep=False)] except: # pandas 0.16 duplicated = datatr[datatr.duplicated(take_last=False)] tokeep = [ x for x in duplicated.index if x not in duplicated_no_first.index ] # Let us create a groupby strategy groups = {} # Let us now add the corrsponding duplicats for feature in tokeep: # Find all row identical to this feature matches = (duplicated.ix[feature] == duplicated).all(axis=1) groups[feature] = "__".join(duplicated.index[matches]) # This drops all duplicated columns (the first is kept, others are # dropped) self.df = self.df.transpose().drop_duplicates().transpose() self.df.rename(columns=groups, inplace=True) # We want to keep the column names informative that is if there were # duplicates, we rename the column kept with the concatenation of all # the corresponding duplicates print("compressed %s groups of duplicates" % len(groups)) return groups def get_TCGA(self): from gdsctools.cosmictools import COSMICInfo c = COSMICInfo() tcga = c.df.ix[self.df.index].TCGA return tcga