def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: not used by this constructor; kept so that existing
        callers keep working.
    :param drug_decode: optional input handed to :class:`DrugDecode`.
        When provided, it takes precedence over ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG identifiers are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # Start from default settings and overwrite them with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # The gdsc instance may hold no drug decoder while the user provided
    # one explicitly, in which case the explicit one must be used.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best-effort (a failure
    # must not prevent the report from being built) but is no longer
    # silent, and KeyboardInterrupt/SystemExit are not swallowed anymore.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception as err:
        warnings.warn(
            "Could not replace infinite values in the results: %s" % err)

    # create some data
    self._set_sensible_df()

    self.company = None
def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
             main_directory="tissue_packages", verbose=True):
    """.. rubric:: Constructor

    :param ic50: an :class:`~gdsctools.readers.IC50` file.
    :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
    :param genomic_feature_pattern: a glob to a set of
        :class:`~gdsctools.readers.GenomicFeature` files.
    :param main_directory: stored in :attr:`main_directory`.
    :param verbose: forwarded to the parent constructor.
    """
    super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)

    # Only a filename is accepted for the IC50 input
    assert isinstance(ic50, str)

    # Remember the input files and where the packages are stored
    self.ic50_filename, self.dd_filename = ic50, drug_decode
    self.main_directory = main_directory

    # Default settings, without the progress animation
    settings = ANOVASettings()
    settings.animate = False
    self.settings = settings

    self.drug_decode = DrugDecode(drug_decode)

    for message in ("Those settings will be used (check FDR_threshold)",
                    self.settings):
        print(message)

    # figure out the cancer types:
    self.results = {}

    self.company_directory = "company_packages"

    # quick test on 15 features
    self.test = False
def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
             mode='standard'):
    """.. rubric:: Constructor

    :param ic50: input handed to the IC50 reader; also stored in
        :attr:`ic50_filename`.
    :param drug_decode: input handed to :class:`DrugDecode`; also stored
        in :attr:`dd_filename`.
    :param genomic_feature_pattern: forwarded to the parent constructor.
    :param mode: when set to ``'v18'``, the IC50 input is read with
        :class:`IC50Cluster` and the FDR threshold is set to 35.
        Otherwise :class:`IC50` is used.
    """
    super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
    self.debug = False

    self.ic50_filename = ic50
    self.dd_filename = drug_decode

    is_v18 = mode == 'v18'

    # The reader class depends on the mode
    reader = IC50Cluster if is_v18 else IC50
    self.ic50 = reader(ic50)
    self.drug_decode = DrugDecode(drug_decode)

    self.settings = ANOVASettings()
    self.settings.low_memory = True
    if is_v18:
        self.settings.FDR_threshold = 35

    banner = ("Those settings will be used (note that low_memory is set "
              "to True and check the value of FDR_threshold (set to 35 in v18)")
    print(banner)
    print(self.settings)

    # figure out the cancer types:
    self.results = {}
def __init__(self, gdsc, results, sep="\t", drug_decode=None):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`
    :param sep: not used by this constructor; kept so that existing
        callers keep working.
    :param drug_decode: optional input handed to :class:`DrugDecode`.
        When provided, it takes precedence over ``gdsc.drug_decode``.
    """
    self.figtools = Savefig()
    self.gdsc = gdsc

    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Start from default settings and overwrite them with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # The gdsc instance may hold no drug decoder while the user provided
    # one explicitly, in which case the explicit one must be used.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences used to be glued together without a space.
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # create some data
    self._set_sensible_df()

    # just to create the directory
    ReportMAIN(directory=self.settings.directory)
def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: not used by this constructor; kept so that existing
        callers keep working.
    :param drug_decode: optional input handed to :class:`DrugDecode`.
        When provided, it takes precedence over ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG identifiers are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # Start from default settings and overwrite them with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # The gdsc instance may hold no drug decoder while the user provided
    # one explicitly, in which case the explicit one must be used.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best-effort (a failure
    # must not prevent the report from being built) but is no longer
    # silent, and KeyboardInterrupt/SystemExit are not swallowed anymore.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception as err:
        warnings.warn(
            "Could not replace infinite values in the results: %s" % err)

    # create some data
    self._set_sensible_df()

    self.company = None
def test_drugs():
    """Check that DrugDecode reads CSV/TSV inputs and exposes its helpers"""
    # The same content read from a CSV and a TSV file must compare equal
    r1 = DrugDecode(testing.drug_test_csv)
    r1.drugIds  # attribute access only; the value is not checked
    r2 = DrugDecode(testing.drug_test_tsv)
    r2.drugIds
    assert r1 == r2

    # r1.get_info() this example fails because all webrelease are NAN
    assert len(r1) == 11

    dd = DrugDecode(gdsctools_data("test_drug_decode_comp.csv"))
    assert dd.companies == ["ME"]
    assert dd.is_public(5) == 'Y'
    dd.check()
    assert dd.get_info()['N_prop'] == 1

    # test repr and print. A bare ``dd`` statement is a no-op inside a
    # function, so __repr__ is now called explicitly.
    print(dd)
    repr(dd)

    # test __add__
    assert dd + dd == dd
    assert len(dd.get_public_and_one_company("ME")) == 10
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()

    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is
    below 0, otherwise it is **resistant**.
    """

    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to
            report
        :param results: the results returned by :meth:`ANOVA.anova_all`.
            If not provided, the ANOVA is run on the fly.
        :param sep: not used by this constructor; kept so that existing
            callers keep working.
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When provided, it takes precedence over ``gdsc.drug_decode``.
        :param verbose: stored in :attr:`verbose`.
        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG identifiers are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # Start from default settings and overwrite them with those of gdsc
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # The gdsc instance may hold no drug decoder while the user
        # provided one explicitly, in which case the explicit one is used.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. Best-effort (must not prevent
        # the report from being built) but no longer silent.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn(
                "Could not replace infinite values in the results: %s" % err)

        # create some data
        self._set_sensible_df()

        self.company = None

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
                           doc="return number of cell lines")

    def _df_append(self, df, data):
        """Append *data* as a new last row of *df* and return *df*"""
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)"""
        self._set_sensible_df()

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = self.n_drugs

        N = float(n_drugs * n_features)
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except Exception:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        empty_line = ["", ""]  # trick to have an empty line
        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            empty_line,
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
             self.settings.include_MSI_factor],
            empty_line,
            ["Total number of significant associations", nsens + nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
             self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
             "[{:.4}, {:.4}]".format(p1, p2)],
            ["Range of significant % FDRs",
             '[{:.4} {:.4}]'.format(f1, f2)],
        ]

        df = pd.DataFrame({'text': [], 'value': []})
        for row in rows:
            df = self._df_append(df, row)
        return df

    def _get_significant_values(self, name):
        """Return column *name* restricted to the significant hits

        The significant hits are the rows currently stored in
        :attr:`sensible_df` and :attr:`resistant_df`.
        """
        return pd.concat([self.sensible_df[name], self.resistant_df[name]])

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        The range is computed on the significant rows themselves instead
        of the first N rows of :attr:`df`, which was only correct when the
        dataframe happened to be sorted by p-value.

        :return: tuple (min, max); (0., 0.) if there is no significant hit
        """
        data = self._get_significant_values(self.varname_pval)
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _get_fdr_range(self):
        """Get FDR range of the significant hits

        The same set of rows as in :meth:`_get_pval_range` is used, so
        that rows failing the p-value threshold are not reported.

        :return: tuple (min, max); (0., 0.) if there is no significant hit
        """
        data = self._get_significant_values(self.varname_qval)
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _set_sensible_df(self):
        """Set :attr:`sensible_df` and :attr:`resistant_df` from settings"""
        below_fdr = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        below_pval = (self.df['ANOVA_FEATURE_pval'] <
                      self.settings.pvalue_threshold)
        significant = np.logical_and(below_fdr, below_pval)

        # select sensible data set
        sensible = self.df['FEATURE_delta_MEAN_IC50'] < 0
        self.sensible_df = self.df[np.logical_and(significant, sensible)]

        # select resistant data set
        resistant = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[np.logical_and(significant, resistant)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except AttributeError:
            # pandas versions without DataFrame.sort_values
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge sensitive and resistant counts into one sorted dataframe

        :return: dataframe with the columns 'sens assoc', 'res assoc' and
            'total', sorted by decreasing total and then by index.
        """
        # we can drop all columns except one, which is renamed as count
        df1 = df_count_sensible['ASSOC_ID']
        df1.name = 'sens assoc'
        df2 = df_count_resistant['ASSOC_ID']
        df2.name = 'res assoc'

        # Now, we join the two series (a single column was selected above)
        df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and if equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True], inplace=True)
        except AttributeError:
            # pandas versions without DataFrame.sort_values
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True], inplace=True)
        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        return self._get_data(df_count_sensible, df_count_resistant)

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: not used; kept for backward compatibility
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        return self._get_data(df_count_sensible, df_count_resistant)

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: not used; kept for backward compatibility
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # 'size_inches' (not 'set_inches'), as in drug_summary
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the bar plot"""
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]

        # positions in decreasing order (first label gets the highest one)
        ind = list(range(0, len(labels)))
        ind.reverse()

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values

        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        pylab.barh(ind, data2, height=0.8, color='orange', left=data1,
                   label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # only integer ticks get a label
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug response",
                    fontsize=self.settings.fontsize/1.2)
        # NOTE(review): the label shows "FDR >" whereas hits are selected
        # with FDR < FDR_threshold in _set_sensible_df -- to be confirmed.
        pylab.xlabel(
            r'Number of significant associations (FDR %s %s %s) '
            % ("$>$", self.settings.FDR_threshold, r"$\%$"),
            fontsize=16)

        M = max(data1 + data2)
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # the layout adjustment is cosmetic only
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        pos_mean = self.df['FEATURE_pos_logIC50_MEAN']
        neg_mean = self.df['FEATURE_neg_logIC50_MEAN']
        maskMC = ((pos_mean < MC1) | (pos_mean < MC2) |
                  (neg_mean < MC1) | (neg_mean < MC2))

        for fdr in fdrs:
            # significant hits
            below_fdr = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(below_fdr.sum())

            # meaningful hits
            indices = np.logical_and(below_fdr, maskMC)
            significant_meaningful.append(indices.sum())

            subset = self.df.loc[indices]
            pos_strong = subset['FEATURE_pos_Glass_delta'] >= 1
            neg_strong = subset['FEATURE_neg_Glass_delta'] >= 1

            # meaningful strong hits
            strong_hits.append(np.logical_or(pos_strong, neg_strong).sum())

            # meaningful full strong hits
            full_strong_hits.append(
                np.logical_and(pos_strong, neg_strong).sum())

        data = {
            'significants': significants,
            'full_strong_hits': full_strong_hits,
            'strong_hits': strong_hits,
            'significant_meaningful': significant_meaningful,
        }

        df = pd.DataFrame(data, columns=[
            'significants', 'significant_meaningful',
            'strong_hits', 'full_strong_hits'], index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
                      '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'),
        return df

    def __str__(self):
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.
        """
        if self.verbose:
            print(
                "Creating individual HTML pages for each significant association"
            )
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        # a single instance is shared and updated for each association
        html = Association(self, drug='dummy', feature='dummy',
                           fdr='dummy', company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            # filenames always start with the letter "a"
            if str(assocs[i]).startswith("a"):
                html._filename = str(assocs[i]) + '.html'
            else:
                html._filename = "a" + str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        N = len(groups.indices.keys())
        pb = Progress(N)
        for i, feature in enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # drugs without significant hit get an empty container
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)"""
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        # savefig is forced to True while the main page is created and
        # restored afterwards, even if the creation fails
        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report()  # created the directory
            html.create_report(onweb=onweb)
        finally:
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.
        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages"""
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        """Call easydev's ``onweb`` on the index.html of the report"""
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')
def _create_summary_pages(self, main_directory, verbose=True, company=None):
    """Create the index.html page summarising all analysis directories

    Each sub directory of *main_directory* whose name is found in
    :attr:`tcga` contributes one row to the summary.

    :param main_directory: directory that contains one sub directory per
        analysis.
    :param verbose: print each directory taken into account.
    :param company: if set, stored in the 'collaborator' field of the page.
    :return: the summary dataframe, sorted by decreasing number of hits.
    """
    columns = ['Analysis name', 'Number of hits',
               'Number of involved proprietary compounds', 'out of',
               'Number of involved public', 'out of']

    # Read all directories in tissue_packages
    directories = glob.glob(main_directory + os.sep + '*')

    summary = []
    for directory in sorted(directories):
        tcga = directory.split(os.sep)[-1]
        if tcga not in self.tcga:
            continue
        if verbose:
            print(directory, tcga)

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            # best effort: an analysis without a readable summary gets an
            # empty row instead of stopping the whole summary
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = hits['Unnamed: 0'].unique()

        # NOTE(review): the parsed results are not used below. The read is
        # kept so that an unreadable results.csv still fails here as before.
        ANOVAResults(path + 'results.csv')

        # where to find the DRUG DECODE file. Should
        # have been copied
        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # .ix has been removed from pandas; .loc is the label-based
        # equivalent (a drug missing from DRUG_DECODE raises a KeyError
        # with recent pandas).
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([tcga, total_hits,
                        drug_inv_prop, info['N_prop'],
                        drug_inv_public, info['N_public']])

    # Providing the columns here (instead of assigning them afterwards)
    # also works when no analysis directory was found.
    df = pd.DataFrame(summary, columns=columns)

    try:
        df.sort_values(by="Number of hits", ascending=False, inplace=True)
    except AttributeError:
        # pandas versions without DataFrame.sort_values
        df.sort("Number of hits", ascending=False, inplace=True)

    self.html_page = ReportMain(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html',
                                mode="summary")

    # Let us use our HTMLTable to add the HTML references
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_table.add_bgcolor('Number of hits')

    self.html_page.jinja['data_table'] = self.html_table.to_html(
        collapse_table=False)
    if company:
        self.html_page.jinja["collaborator"] = company
    self.html_page.write()

    return df
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    DRUG_DECODE and IC50 inputs are filtered to keep only the drugs with
    WEBRELEASE=Y and those owned by the company.

    :param companies: a company name (string) or a list of names. By
        default, all companies found in :attr:`companies` are used.
    :raises ValueError: if the list of companies is empty.
    """
    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError(
            "Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving information built
    # in analyse() method, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(
            purple("\n=========== Analysing company %s out of %s (%s)" %
                   (ii + 1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown(" ------- building TCGA %s sub directory" % tcga))

            # Use the results previously computed if available, otherwise
            # read them from the results file of that TCGA
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            company_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure these are integers.
            # (.ix has been removed from pandas; a positional boolean mask
            # with .loc selects the same rows)
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = results.df["DRUG_ID"].isin(company_drugs)
            results.df = results.df.loc[mask.values]

            # We read the IC50 again, falling back on the cluster reader
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analyse
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns whose label is a drug of that
            # package (DataFrame.select has been removed from pandas)
            to_keep = an.ic50.df.columns.isin(company_drugs)
            an.ic50.df = an.ic50.df.loc[:, to_keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = (self.company_directory + os.sep +
                                     company + os.sep + tcga)
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def _create_summary_pages(self, main_directory, verbose=True, company=None):
    """Create the index.html page summarising all analysis directories

    Each sub directory of *main_directory* whose name is found in
    :attr:`tcga` contributes one row to the summary.

    :param main_directory: directory that contains one sub directory per
        analysis.
    :param verbose: print each directory taken into account.
    :param company: if set, stored in the 'collaborator' field of the page.
    :return: the summary dataframe, sorted by decreasing number of hits.
    """
    columns = ['Analysis name', 'Number of hits',
               'Number of involved proprietary compounds', 'out of',
               'Number of involved public', 'out of']

    # Read all directories in tissue_packages
    directories = glob.glob(main_directory + os.sep + '*')

    summary = []
    for directory in sorted(directories):
        tcga = directory.split(os.sep)[-1]
        if tcga not in self.tcga:
            continue
        if verbose:
            print(directory, tcga)

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            # best effort: an analysis without a readable summary gets an
            # empty row instead of stopping the whole summary
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = hits['Unnamed: 0'].unique()

        # NOTE(review): the parsed results are not used below. The read is
        # kept so that an unreadable results.csv still fails here as before.
        ANOVAResults(path + 'results.csv')

        # where to find the DRUG DECODE file. Should
        # have been copied
        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # .ix has been removed from pandas; .loc is the label-based
        # equivalent (a drug missing from DRUG_DECODE raises a KeyError
        # with recent pandas).
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([tcga, total_hits,
                        drug_inv_prop, info['N_prop'],
                        drug_inv_public, info['N_public']])

    # Providing the columns here (instead of assigning them afterwards)
    # also works when no analysis directory was found.
    df = pd.DataFrame(summary, columns=columns)

    try:
        df.sort_values(by="Number of hits", ascending=False, inplace=True)
    except AttributeError:
        # pandas versions without DataFrame.sort_values
        df.sort("Number of hits", ascending=False, inplace=True)

    self.html_page = ReportMain(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html',
                                mode="summary")

    # Let us use our HTMLTable to add the HTML references
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_table.add_bgcolor('Number of hits')

    self.html_page.jinja['data_table'] = self.html_table.to_html(
        collapse_table=False)
    if company:
        self.html_page.jinja["collaborator"] = company
    self.html_page.write()

    return df
def create_summary_pages(self, main_directory="tissue_packages"):
    """Create the index.html page summarising all analysis directories

    Each sub directory of *main_directory* contributes one row to the
    summary, except the directories that hold static or shared content.

    :param main_directory: directory that contains one sub directory per
        analysis; also stored in the 'collaborator' field of the page.
    :return: the summary dataframe.
    """
    columns = ['Analysis name', 'Number of hits',
               'Number of involved proprietary compounds', 'out of',
               'Number of involved public', 'out of']
    # sub directories that are not analysis directories
    skipped = ('css', 'images', 'INPUT', 'OUTPUT', 'code', 'js')

    directories = glob.glob(main_directory + os.sep + '*')
    directories = [x for x in directories if os.path.isdir(x)]

    summary = []
    for directory in sorted(directories):
        # Last path component. Taking the second component was wrong as
        # soon as main_directory had more than one component.
        tcga = os.path.basename(directory)
        if tcga in skipped:
            continue

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            # best effort: an analysis without a readable summary gets an
            # empty row instead of stopping the whole summary
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

        # NOTE(review): the parsed results are not used below. The read is
        # kept so that an unreadable results.csv still fails here as before.
        ANOVAResults(path + 'results.csv')

        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # .ix has been removed from pandas; .loc is the label-based
        # equivalent (a drug missing from DRUG_DECODE raises a KeyError
        # with recent pandas).
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([tcga, total_hits,
                        drug_inv_prop, info['N_prop'],
                        drug_inv_public, info['N_public']])

    # Providing the columns here (instead of assigning them afterwards)
    # also works when no analysis directory was found.
    df = pd.DataFrame(summary, columns=columns)

    # FIXME include css and images of logo
    # FIXME save in the proper directory
    self.html_page = ReportMAIN(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html')

    # Let us use our HTMLTable to add the HTML references
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')

    self.html_page.jinja['data_table'] = self.html_table.to_html()
    self.html_page.jinja['collaborator'] = main_directory
    self.html_page.write()

    return df
def create_summary_pages(self, main_directory='ALL'):
    """Copy the web assets and create the summary ``index.html``.

    The ``css``, ``js`` and ``images`` sub-directories are created in
    *main_directory* and populated with the files shipped with gdsctools
    (only if not already present). Every other sub-directory is treated
    as one analysis: the number of hits is read from
    ``OUTPUT/drugs_summary.csv`` and the drugs involved are split into
    public (``WEBRELEASE == 'Y'``) and proprietary ones using
    ``INPUT/DRUG_DECODE.csv``.

    :param main_directory: directory containing one sub-directory per
        analysis. It is used for the assets, the directory scan and the
        HTML page alike.
    :return: dataframe with one row per analysis. Analyses whose
        ``drugs_summary.csv`` cannot be read get a row filled with None.
    """
    from gdsctools import gdsctools_data

    # Asset directories; they must not be reported as analyses.
    asset_dirs = ('images', 'css', 'js')
    for subdir in asset_dirs:
        self.mkdir(main_directory + os.sep + subdir)

    # (sub-directory, file name, name of the gdsctools data resource)
    assets = [
        ('css', 'gdsc.css', 'gdsc.css'),
        ('css', 'github-gist.css', 'github-gist.css'),
        ('js', 'highlight.pack.js', 'highlight.pack.js'),
        ('images', 'EBI_logo.png', "images" + os.sep + 'EBI_logo.png'),
        ('images', 'sanger-logo.png',
         "images" + os.sep + 'sanger-logo.png'),
    ]
    for subdir, filename, resource in assets:
        target = os.sep.join([main_directory, subdir, filename])
        if not os.path.isfile(target):
            shutil.copy(gdsctools_data(resource), target)

    columns = ['Analysis name', 'Number of hits',
               'Number of involved proprietary compounds', 'out of',
               'Number of involved public', 'out of']

    # Scan the requested directory (not a hard-coded 'ALL').
    directories = glob.glob(main_directory + os.sep + '*')
    directories = sorted(x for x in directories if os.path.isdir(x))

    summary = []
    for directory in directories:
        tcga = os.path.basename(directory)
        if tcga in asset_dirs:
            continue

        # number of hits
        hits_file = os.path.join(directory, 'OUTPUT', 'drugs_summary.csv')
        try:
            hits = pd.read_csv(hits_file, sep=',')
        except (IOError, OSError, ValueError):
            # Missing, empty or unparsable summary: keep an empty row so
            # that the analysis is still listed.
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

        decode_file = os.path.join(directory, 'INPUT', 'DRUG_DECODE.csv')
        drug_decode = DrugDecode(decode_file)
        info = drug_decode.get_info()

        # reindex keeps one entry per involved drug; a drug missing from
        # the decoder yields NaN and is therefore counted as not public.
        webrelease = drug_decode.df.WEBRELEASE.reindex(drug_involved)
        drug_inv_public = (webrelease == 'Y').sum()
        drug_inv_prop = (webrelease != 'Y').sum()

        summary.append([tcga, total_hits,
                        drug_inv_prop, info['N_prop'],
                        drug_inv_public, info['N_public']])

    # Passing the columns to the constructor also works when no analysis
    # was found (empty summary).
    df = pd.DataFrame(summary, columns=columns)

    # FIXME include css and images of logo
    # FIXME save in the proper directory
    self.html_page = ReportMAIN(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html')

    # Let us use our HTMLTable to add the HTML references
    from gdsctools.report import HTMLTable
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_page.jinja['data_table'] = self.html_table.to_html()
    self.html_page.write()

    return df
def create_summary_pages(self, main_directory='ALL'):
    """Install the web assets and write the summary ``index.html``.

    1. ``images``, ``css`` and ``js`` are created inside *main_directory*
       and filled with the gdsctools resources that are not yet there.
    2. Each remaining sub-directory is one analysis; its hits come from
       ``OUTPUT/drugs_summary.csv`` and the public/proprietary split of
       the drugs involved comes from ``INPUT/DRUG_DECODE.csv``
       (``WEBRELEASE == 'Y'`` means public).
    3. The resulting table is rendered with the
       ``datapack_summary.html`` template.

    :param main_directory: directory containing one sub-directory per
        analysis; assets, directory scan and HTML output all use it.
    :return: dataframe with one row per analysis. Analyses whose
        ``drugs_summary.csv`` cannot be read get a row filled with None.
    """
    from gdsctools import gdsctools_data

    def install(subdir, filename, resource):
        # Copy a packaged resource unless the target already exists.
        target = os.sep.join([main_directory, subdir, filename])
        if not os.path.isfile(target):
            shutil.copy(gdsctools_data(resource), target)

    self.mkdir(main_directory + os.sep + 'images')
    self.mkdir(main_directory + os.sep + 'css')
    self.mkdir(main_directory + os.sep + 'js')

    for filename in ['gdsc.css', 'github-gist.css']:
        install('css', filename, filename)
    for filename in ['highlight.pack.js']:
        install('js', filename, filename)
    for filename in ['EBI_logo.png', 'sanger-logo.png']:
        install('images', filename, "images" + os.sep + filename)

    # The asset directories created above are not analyses ('js' used to
    # be reported as an empty analysis).
    not_an_analysis = {'css', 'images', 'js'}

    summary = []
    # Scan the requested directory (not a hard-coded 'ALL').
    for directory in sorted(glob.glob(main_directory + os.sep + '*')):
        if not os.path.isdir(directory):
            continue
        tcga = os.path.basename(directory)
        if tcga in not_an_analysis:
            continue

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except (IOError, OSError, ValueError):
            # Missing, empty or unparsable summary: keep an empty row so
            # that the analysis is still listed.
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # reindex keeps one entry per involved drug; a drug missing from
        # the decoder yields NaN and is therefore counted as not public.
        webrelease = drug_decode.df.WEBRELEASE.reindex(drug_involved)
        is_public = webrelease == 'Y'
        drug_inv_public = is_public.sum()
        drug_inv_prop = (~is_public).sum()

        summary.append([
            tcga, total_hits, drug_inv_prop, info['N_prop'],
            drug_inv_public, info['N_public']
        ])

    # Giving the columns to the constructor also covers the case where no
    # analysis directory was found.
    df = pd.DataFrame(summary, columns=[
        'Analysis name', 'Number of hits',
        'Number of involved proprietary compounds', 'out of',
        'Number of involved public', 'out of'
    ])

    # FIXME include css and images of logo
    # FIXME save in the proper directory
    self.html_page = ReportMAIN(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html')

    # Let us use our HTMLTable to add the HTML references
    from gdsctools.report import HTMLTable
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_page.jinja['data_table'] = self.html_table.to_html()
    self.html_page.write()

    return df
def create_data_packages_for_companies(self, companies=None):
    """Create one data package (HTML report) per company and per TCGA.

    IMPORTANT: DRUG_DECODE and IC50 inputs must be filtered to keep only
    the drugs with WEBRELEASE=Y and the drugs owned by the company.

    For each company and each genomic feature file, the results of the
    analysis are restricted to the drugs the company may see, a new
    report is generated in ``<company>/<tcga>`` and, unless
    ``self.debug`` is set, the per-drug HTML pages and volcano images
    are copied from the ``ALL`` analysis.

    :param companies: a company name, a list of company names or None,
        in which case ``self.companies`` is used.
    """
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii + 1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Read the results previously computed, either from memory
            # or, if not available, from the ALL directory.
            try:
                results_df = self.results[tcga].df.copy()
            except (KeyError, AttributeError):
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                results_df = ANOVAResults(results_path)

            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed_ids = drug_decode_company.df.index

            # filter results using the new drug decode
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in allowed_ids for x in drug_ids_in_results]
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename,
                       drug_decode_company)

            # Boolean column mask (replaces the removed DataFrame.select)
            keep = [get_drug_id(column) in allowed_ids
                    for column in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

                # For now, we just copy all DRUG images from
                # the analysis made in ALL
                from easydev import shellcmd, Progress
                print("\nCopying drug files")
                drug_ids = results.df.DRUG_ID.unique()
                pb = Progress(len(drug_ids))

                # Source and destination do not depend on the drug.
                html_source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                html_dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                image_source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                                      os.sep, os.sep)
                image_dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                   os.sep, os.sep)

                for i, drug_id in enumerate(drug_ids):
                    # copy the HTML
                    filename = "%s.html" % drug_id
                    cmd = "cp %s%s %s" % (html_source, filename, html_dest)
                    shellcmd(cmd, verbose=False)

                    # copy the images (the wildcard is expanded by the
                    # shell, hence the use of a shell command)
                    filename = "volcano_%s.*" % drug_id
                    cmd = "cp %s%s %s" % (image_source, filename,
                                          image_dest)
                    shellcmd(cmd, verbose=False)
                    pb.animate(i + 1)
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()

    .. rubric:: Significant association

    An association is significant if

    - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
    - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is
    below 0, otherwise it is **resistant**.
    """

    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to
            report
        :param results: the results returned by :meth:`ANOVA.anova_all`.
            If not provided, the ANOVA is run on the fly.
        :param sep: not used by the constructor.
        :param drug_decode: optional input for :class:`DrugDecode`; it
            takes precedence over the one stored in *gdsc*.
        :param verbose: print progress messages when creating reports.
        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was no drug_decode in the gdsc parameter,
        # so a user may have provided a file, in which case, we need
        # to update the content of the drug decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero (best effort: a failure here
        # must not prevent the report from being created).
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn("Could not replace infinite values: %s" % err)

        # create some data
        self._set_sensible_df()

        self.company = None

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
                           doc="return number of cell lines")

    def _df_append(self, df, data):
        """Append the row *data* at the end of *df* (in place)

        The new row is labelled with the current length of *df*.

        :return: the same dataframe
        """
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The dataframe has two columns (*text* and *value*), one row per
        item; rows with empty strings are used as separators.
        """
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except (ValueError, OverflowError):
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            # trick to have an empty line
            ["", ""],
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
             self.settings.include_MSI_factor],
            # trick to have an empty line
            ["", ""],
            ["Total number of significant associations", nsens + nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
             self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
             "[{:.4}, {:.4}]".format(p1, p2)],
            ["Range of significant % FDRs",
             '[{:.4} {:.4}]'.format(f1, f2)],
        ]
        for row in rows:
            df = self._df_append(df, row)

        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        The range is computed on the sensible and resistant sets as last
        built by :meth:`_set_sensible_df` (no assumption on the order of
        the rows in :attr:`df`).

        :return: (min, max) or (0., 0.) if there is no significant hit
        """
        name = self.varname_pval
        pvals = pd.concat([self.sensible_df[name], self.resistant_df[name]])
        if len(pvals) == 0:
            return 0., 0.
        return pvals.min(), pvals.max()

    def _get_fdr_range(self):
        """Get FDR range of the hits below the FDR threshold

        :return: (min, max) or (0., 0.) if no FDR is below the threshold
        """
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        m, M = data.min(), data.max()
        return m, M

    def _set_sensible_df(self):
        """Set :attr:`sensible_df` and :attr:`resistant_df`

        Both are subsets of :attr:`df` where FDR and p-value are below
        the thresholds found in :attr:`settings`; the sign of
        *FEATURE_delta_MEAN_IC50* tells sensible (< 0) from resistant.
        """
        below_fdr = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        below_pval = (self.df['ANOVA_FEATURE_pval'] <
                      self.settings.pvalue_threshold)
        significant = below_fdr & below_pval
        negative_delta = self.df['FEATURE_delta_MEAN_IC50'] < 0

        # select sensible data set
        self.sensible_df = self.df[significant & negative_delta]

        # select resistant data set
        positive_delta = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[significant & positive_delta]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)

        The hits are sorted by *ASSOC_ID*.
        """
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except AttributeError:
            # pandas versions without sort_values
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge sensible and resistant counts into a single dataframe

        :param df_count_sensible: counts (groupby) of the sensible hits
        :param df_count_resistant: counts (groupby) of the resistant hits
        :return: dataframe with columns *sens assoc*, *res assoc* and
            *total*, sorted by decreasing total then by index.
        """
        # we can drop all columns except one, which is renamed
        sens = pd.DataFrame({'sens assoc': df_count_sensible['ASSOC_ID']})
        res = pd.DataFrame({'res assoc': df_count_resistant['ASSOC_ID']})

        # Now, we join the two counts and set NA to zero
        df_count = sens.join(res, how='outer')
        df_count.fillna(0, inplace=True)

        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and if equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True], inplace=True)
        except AttributeError:
            # pandas versions without sort_values
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True], inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        return self._get_data(df_count_sensible, df_count_resistant)

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: not used
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        return self._get_data(df_count_sensible, df_count_resistant)

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: not used
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # same keyword as in drug_summary (was misspelled set_inches)
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the bar plot

        :param df_count: dataframe returned by :meth:`_get_data`
        :param title_tag: 'drug' or 'feature'; used in the title. With
            'drug', the drug name is appended to the labels.
        :param top: maximum number of rows to show
        """
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)
        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]

        # first row at the top of the figure
        ind = list(range(len(labels)))
        ind.reverse()

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values

        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        pylab.barh(ind, data2, height=0.8, color='orange', left=data1,
                   label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # show only integer ticks
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug response",
                    fontsize=self.settings.fontsize/1.2)
        # NOTE(review): the label shows "FDR >" although significant hits
        # are selected with FDR < FDR_threshold -- to be confirmed.
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$>$", self.settings.FDR_threshold, r"$\%$"),
                     fontsize=18)

        totals = data1 + data2
        M = max(totals) if len(totals) else 0
        ax.set_xlim([0, M+1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # the layout is cosmetic; keep the figure as it is
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits
        :return: dataframe indexed by FDR threshold (5 to 50 by step of 5)
            with the number of hits in 4 nested categories.

        .. todo:: to finalise
        """
        fdrs = list(range(5, 50+1, 5))

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        pos_mean = self.df['FEATURE_pos_logIC50_MEAN']
        neg_mean = self.df['FEATURE_neg_logIC50_MEAN']
        maskMC = ((pos_mean < MC1) | (pos_mean < MC2) |
                  (neg_mean < MC1) | (neg_mean < MC2))

        for fdr in fdrs:
            # significant hits
            below_fdr = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(below_fdr.sum())

            # meaningful hits
            indices = np.logical_and(below_fdr, maskMC)
            significant_meaningful.append(indices.sum())

            subset = self.df.loc[indices]
            pos_strong = subset['FEATURE_pos_Glass_delta'] >= 1
            neg_strong = subset['FEATURE_neg_Glass_delta'] >= 1

            # meaningful strong hits
            strong_hits.append(np.logical_or(pos_strong, neg_strong).sum())

            # meaningful full strong hits
            full_strong_hits.append(
                np.logical_and(pos_strong, neg_strong).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
                'strong_hits': strong_hits,
                'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data, columns=['significants',
                                         'significant_meaningful',
                                         'strong_hits',
                                         'full_strong_hits'],
                          index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
                      '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'),
        return df

    def __str__(self):
        # DataFrame.info prints the information itself
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.
        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        # one shared instance, updated for each association
        html = Association(self, drug='dummy', feature='dummy',
                           fdr='dummy', company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            # the filename always starts with the letter a
            assoc = str(assocs[i])
            if assoc.startswith("a"):
                html._filename = assoc + '.html'
            else:
                html._filename = "a" + assoc + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        features = groups.indices.keys()
        pb = Progress(len(features))
        for i, feature in enumerate(features):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        A page is created for every drug found in :attr:`df`, including
        those without any significant hit.
        """
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        pb = Progress(len(all_drugs))
        for i, drug in enumerate(all_drugs):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                # no significant hit for that drug
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)

        :param onweb: open browser with the created HTML page.
        """
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        # figures must be saved for the main page; the user setting is
        # restored afterwards, even if the creation of the page fails.
        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report()  # created the directory
            html.create_report(onweb=onweb)
        finally:
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.
        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages

        :param onweb: open browser with the main HTML page.
        """
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        """Open the main HTML page (index.html) in a browser"""
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')
def test_drugs():
    """CSV and TSV versions of the drug test file give equal decoders."""
    decoders = []
    for source in (testing.drug_test_csv, testing.drug_test_tsv):
        decoder = DrugDecode(source)
        # access the property to check that it can be evaluated
        decoder.drugIds
        decoders.append(decoder)

    from_csv, from_tsv = decoders
    assert from_csv == from_tsv