class ANOVAReport(object):
    """Interpret ANOVA results and create the final HTML report.

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()

    .. rubric:: Significant association

    An association is significant if

    - The field *ANOVA_FEATURE_FDR* is < FDR_threshold
    - The field *ANOVA_FEATURE_pval* is < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is
    below 0, otherwise it is **resistant**.
    """

    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to
            report
        :param results: the results returned by :meth:`ANOVA.anova_all`.
            If not provided, the ANOVA is run on the fly.
        :param sep: accepted for backward compatibility; not used here.
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When given, it takes precedence over ``gdsc.drug_decode``.
        :param verbose: print progress messages while creating the pages.
        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG identifiers are integers
        self.df['DRUG_ID'] = self.df['DRUG_ID'].astype(int)

        # Start from default settings and override them with the settings
        # of the analysis that produced the results.
        self.settings = ANOVASettings()
        for key, value in gdsc.settings.items():
            self.settings[key] = value

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # Maybe there was no drug_decode in the gdsc parameter, so a user
        # may have provided a file, in which case we need to update the
        # content of the drug decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This stays best-effort (the
        # results are kept untouched on failure) but is no longer silent.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn(
                "Could not replace infinite values in the results: %s" % err)

        # create some data
        self._set_sensible_df()

        self.company = None

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests,
                       doc="return number of tests (rows in :attr:`df`)")

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
                           doc="return number of cell lines")

    def _df_append(self, df, data):
        """Add *data* as a new row labelled ``len(df)`` and return *df*.

        The dataframe is modified in place (label-based enlargement).
        """
        df.loc[len(df)] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The returned dataframe has two columns, ``text`` and ``value``,
        with one row per reported quantity. Rows holding two empty
        strings are used as visual separators.
        """
        self._set_sensible_df()

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = self.n_drugs

        N = float(n_drugs * n_features)
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except Exception:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            ["", ""],  # trick to have an empty line
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
             self.settings.include_MSI_factor],
            ["", ""],  # trick to have an empty line
            ["Total number of significant associations", nsens + nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
             self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
             "[{:.4}, {:.4}]".format(p1, p2)],
            ["Range of significant % FDRs",
             '[{:.4} {:.4}]'.format(f1, f2)],
        ]
        return pd.DataFrame(rows, columns=['text', 'value'])

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        The range is computed on the rows stored in :attr:`sensible_df`
        and :attr:`resistant_df` rather than on the first rows of
        :attr:`df`, so it does not depend on how :attr:`df` is sorted.

        :return: ``(min, max)``, or ``(0., 0.)`` if there is no
            significant hit.
        """
        name = self.varname_pval
        data = pd.concat([self.sensible_df[name], self.resistant_df[name]])
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _get_fdr_range(self):
        """Get FDR range of the hits below the FDR threshold

        Only the FDR threshold is applied here (not the p-value one).

        :return: ``(min, max)``, or ``(0., 0.)`` if no FDR is below the
            threshold.
        """
        name = self.varname_qval
        data = self.df[name][self.df[name] < self.settings.FDR_threshold]
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _set_sensible_df(self):
        """Set :attr:`sensible_df` and :attr:`resistant_df` from :attr:`df`

        Both subsets require FDR and p-value below their thresholds; they
        differ by the sign of *FEATURE_delta_MEAN_IC50*.
        """
        significant = (
            (self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold) &
            (self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold)
        )
        delta = self.df['FEATURE_delta_MEAN_IC50']

        # select sensible data set
        self.sensible_df = self.df[significant & (delta < 0)]
        # select resistant data set
        self.resistant_df = self.df[significant & (delta >= 0)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)

        The subsets are recomputed on each call since they change when
        the thresholds in :attr:`settings` change. Rows are sorted by
        *ASSOC_ID*.
        """
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except AttributeError:
            # pandas versions without DataFrame.sort_values
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge sensible/resistant counts into one sorted dataframe

        :param df_count_sensible: ``groupby(...).count()`` of the
            sensible hits
        :param df_count_resistant: ``groupby(...).count()`` of the
            resistant hits
        :return: dataframe with columns ``sens assoc``, ``res assoc`` and
            ``total``, sorted by decreasing total then by index.
        """
        # we can drop all columns except one, which is renamed
        sens = df_count_sensible['ASSOC_ID'].rename('sens assoc')
        res = df_count_resistant['ASSOC_ID'].rename('res assoc')

        # outer join so that entries present in only one of the two
        # subsets are kept; the missing counts are set to zero
        df_count = sens.to_frame().join(res.to_frame(), how='outer')
        df_count = df_count.fillna(0)

        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' and, in case of equality, by the name,
        # which is the index. So let us add the index temporarily as a
        # column, sort, and remove the 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True], inplace=True)
        except AttributeError:
            # pandas versions without DataFrame.sort_values
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True], inplace=True)
        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        return self._get_data(self.sensible_df.groupby(colname).count(),
                              self.resistant_df.groupby(colname).count())

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: not used; kept for backward compatibility
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        return self._get_data(self.sensible_df.groupby('FEATURE').count(),
                              self.resistant_df.groupby('FEATURE').count())

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: not used; kept for backward compatibility
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # same keyword as in drug_summary (was misspelt 'set_inches')
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the bar plot

        :param df_count: dataframe returned by :meth:`_get_data`
        :param title_tag: ``'drug'`` or ``'feature'``; used in the title.
            With ``'drug'``, drug names are appended to the labels when a
            drug decoder is available.
        :param top: max number of rows of *df_count* to show

        The tick labels are also stored in :attr:`labels`.
        """
        top = min(top, len(df_count))

        df = df_count.iloc[0:top][['sens assoc', 'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)

        # underscores become spaces; restrict size to first 30 characters
        labels = [str(x).replace('_', ' ')[0:30] for x in labels]

        # first row at the top of the horizontal bar plot
        ind = list(range(len(labels)))[::-1]

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        pylab.barh(ind, data2, height=0.8, color='orange', left=data1,
                   label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # only integer ticks get a label
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug response",
                    fontsize=self.settings.fontsize/1.2)
        # NOTE(review): the label shows "FDR >" although significant hits
        # have an FDR below the threshold -- confirm the intended symbol.
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$>$", self.settings.FDR_threshold, r"$\%$"),
                     fontsize=16)

        totals = data1 + data2
        M = max(totals) if len(totals) else 0
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # purely cosmetic; keep the figure even if the layout fails
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits
        :return: dataframe indexed by FDR threshold (5 to 50 in steps of
            5) with the number of hits in each category.

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        pos_mean = self.df['FEATURE_pos_logIC50_MEAN']
        neg_mean = self.df['FEATURE_neg_logIC50_MEAN']
        maskMC = ((pos_mean < MC1) | (pos_mean < MC2) |
                  (neg_mean < MC1) | (neg_mean < MC2))

        fdr_values = self.df['ANOVA_FEATURE_FDR']
        for fdr in fdrs:
            # significant hits
            below = fdr_values < fdr
            significants.append(below.sum())

            # meaningful hits
            indices = below & maskMC
            significant_meaningful.append(indices.sum())

            subset = self.df.loc[indices]
            pos_strong = subset['FEATURE_pos_Glass_delta'] >= 1
            neg_strong = subset['FEATURE_neg_Glass_delta'] >= 1

            # meaningful strong hits: at least one Glass delta >= 1
            strong_hits.append((pos_strong | neg_strong).sum())

            # meaningful full strong hits: both Glass deltas >= 1
            full_strong_hits.append((pos_strong & neg_strong).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
                'strong_hits': strong_hits,
                'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data,
                          columns=['significants', 'significant_meaningful',
                                   'strong_hits', 'full_strong_hits'],
                          index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
                      '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'),
        return df

    def __str__(self):
        # prints the dataframe information and returns an empty string
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`. An ``a`` prefix is
        added when the identifier does not already start with ``a``.
        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        df = self.get_significant_set()

        N = len(df)
        pb = Progress(N)

        # one shared instance, updated for each association
        html = Association(self, drug='dummy', feature='dummy',
                           fdr='dummy', company=self.company)

        rows = zip(df['DRUG_ID'].values, df['FEATURE'].values,
                   df['ASSOC_ID'].values, df['ANOVA_FEATURE_FDR'].values)
        for i, (drug, feature, assoc, fdr) in enumerate(rows):
            html.drug = drug
            html.feature = feature
            if str(assoc).startswith("a"):
                html._filename = str(assoc) + '.html'
            else:
                html._filename = "a" + str(assoc) + '.html'
            html.fdr = fdr
            html.assoc_id = assoc
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        features = list(groups.indices.keys())
        pb = Progress(len(features))
        for i, feature in enumerate(features):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        A page is created for every drug found in :attr:`df`. Drugs
        without any significant association get an empty dict in place of
        the sub dataframe.
        """
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        pb = Progress(len(all_drugs))
        for i, drug in enumerate(all_drugs):
            # get the indices and therefore subgroup
            if drug in groups.groups:
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)

        :param onweb: open browser with the created HTML page.

        ``settings.savefig`` is forced to True while the page is built
        and restored afterwards, even if the creation fails.
        """
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report()  # created the directory
            html.create_report(onweb=onweb)
        finally:
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.
        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages

        :param onweb: open browser with the main page; the other pages
            are never opened.
        """
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        """Open ``index.html`` from the settings directory in a browser"""
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')
class ANOVAReport(object):
    """Interpret ANOVA results and create the final HTML report.

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()

    .. rubric:: Significant association

    An association is significant if

    - The field *ANOVA_FEATURE_FDR* is < FDR_threshold
    - The field *ANOVA_FEATURE_pval* is < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is
    below 0, otherwise it is **resistant**.
    """

    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to
            report
        :param results: the results returned by :meth:`ANOVA.anova_all`.
            If not provided, the ANOVA is run on the fly.
        :param sep: accepted for backward compatibility; not used here.
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When given, it takes precedence over ``gdsc.drug_decode``.
        :param verbose: print progress messages while creating the pages.
        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG identifiers are integers
        self.df['DRUG_ID'] = self.df['DRUG_ID'].astype(int)

        # Start from default settings and override them with the settings
        # of the analysis that produced the results.
        self.settings = ANOVASettings()
        for key, value in gdsc.settings.items():
            self.settings[key] = value

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # Maybe there was no drug_decode in the gdsc parameter, so a user
        # may have provided a file, in which case we need to update the
        # content of the drug decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This stays best-effort (the
        # results are kept untouched on failure) but is no longer silent.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn(
                "Could not replace infinite values in the results: %s" % err)

        # create some data
        self._set_sensible_df()

        self.company = None

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests,
                       doc="return number of tests (rows in :attr:`df`)")

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
                           doc="return number of cell lines")

    def _df_append(self, df, data):
        """Add *data* as a new row labelled ``len(df)`` and return *df*.

        The dataframe is modified in place. ``.loc`` replaces the ``.ix``
        indexer, which no longer exists in current pandas.
        """
        df.loc[len(df)] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The returned dataframe has two columns, ``text`` and ``value``,
        with one row per reported quantity. Rows holding two empty
        strings are used as visual separators.
        """
        self._set_sensible_df()

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = self.n_drugs

        N = float(n_drugs * n_features)
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except Exception:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            ["", ""],  # trick to have an empty line
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
             self.settings.include_MSI_factor],
            ["", ""],  # trick to have an empty line
            ["Total number of significant associations", nsens + nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
             self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
             "[{:.4}, {:.4}]".format(p1, p2)],
            ["Range of significant % FDRs",
             '[{:.4} {:.4}]'.format(f1, f2)],
        ]
        return pd.DataFrame(rows, columns=['text', 'value'])

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        The range is computed on the rows stored in :attr:`sensible_df`
        and :attr:`resistant_df` rather than on a slice of the first rows
        of :attr:`df`, so it does not depend on the index labels or on
        how :attr:`df` is sorted.

        :return: ``(min, max)``, or ``(0., 0.)`` if there is no
            significant hit.
        """
        name = self.varname_pval
        data = pd.concat([self.sensible_df[name], self.resistant_df[name]])
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _get_fdr_range(self):
        """Get FDR range of the hits below the FDR threshold

        Only the FDR threshold is applied here (not the p-value one).

        :return: ``(min, max)``, or ``(0., 0.)`` if no FDR is below the
            threshold.
        """
        name = self.varname_qval
        data = self.df[name][self.df[name] < self.settings.FDR_threshold]
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _set_sensible_df(self):
        """Set :attr:`sensible_df` and :attr:`resistant_df` from :attr:`df`

        Both subsets require FDR and p-value below their thresholds; they
        differ by the sign of *FEATURE_delta_MEAN_IC50*.
        """
        significant = (
            (self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold) &
            (self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold)
        )
        delta = self.df['FEATURE_delta_MEAN_IC50']

        # select sensible data set
        self.sensible_df = self.df[significant & (delta < 0)]
        # select resistant data set
        self.resistant_df = self.df[significant & (delta >= 0)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)

        The subsets are recomputed on each call since they change when
        the thresholds in :attr:`settings` change. Rows are sorted by
        *ASSOC_ID*.
        """
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except AttributeError:
            # pandas versions without DataFrame.sort_values
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge sensible/resistant counts into one sorted dataframe

        :param df_count_sensible: ``groupby(...).count()`` of the
            sensible hits
        :param df_count_resistant: ``groupby(...).count()`` of the
            resistant hits
        :return: dataframe with columns ``sens assoc``, ``res assoc`` and
            ``total``, sorted by decreasing total then by index.
        """
        # we can drop all columns except one, which is renamed
        sens = df_count_sensible['ASSOC_ID'].rename('sens assoc')
        res = df_count_resistant['ASSOC_ID'].rename('res assoc')

        # outer join so that entries present in only one of the two
        # subsets are kept; the missing counts are set to zero
        df_count = sens.to_frame().join(res.to_frame(), how='outer')
        df_count = df_count.fillna(0)

        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' and, in case of equality, by the name,
        # which is the index. So let us add the index temporarily as a
        # column, sort, and remove the 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True], inplace=True)
        except AttributeError:
            # pandas versions without DataFrame.sort_values
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True], inplace=True)
        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        return self._get_data(self.sensible_df.groupby(colname).count(),
                              self.resistant_df.groupby(colname).count())

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: not used; kept for backward compatibility
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        return self._get_data(self.sensible_df.groupby('FEATURE').count(),
                              self.resistant_df.groupby('FEATURE').count())

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: not used; kept for backward compatibility
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # same keyword as in drug_summary (was misspelt 'set_inches')
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the bar plot

        :param df_count: dataframe returned by :meth:`_get_data`
        :param title_tag: ``'drug'`` or ``'feature'``; used in the title.
            With ``'drug'``, drug names are appended to the labels when a
            drug decoder is available.
        :param top: max number of rows of *df_count* to show

        The tick labels are also stored in :attr:`labels`.
        """
        top = min(top, len(df_count))

        df = df_count.iloc[0:top][['sens assoc', 'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)

        # underscores become spaces; restrict size to first 30 characters
        labels = [str(x).replace('_', ' ')[0:30] for x in labels]

        # first row at the top of the horizontal bar plot
        ind = list(range(len(labels)))[::-1]

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        pylab.barh(ind, data2, height=0.8, color='orange', left=data1,
                   label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # only integer ticks get a label
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug response",
                    fontsize=self.settings.fontsize/1.2)
        # NOTE(review): the label shows "FDR >" although significant hits
        # have an FDR below the threshold -- confirm the intended symbol.
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$>$", self.settings.FDR_threshold, r"$\%$"),
                     fontsize=18)

        totals = data1 + data2
        M = max(totals) if len(totals) else 0
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # purely cosmetic; keep the figure even if the layout fails
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits
        :return: dataframe indexed by FDR threshold (5 to 50 in steps of
            5) with the number of hits in each category.

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        pos_mean = self.df['FEATURE_pos_logIC50_MEAN']
        neg_mean = self.df['FEATURE_neg_logIC50_MEAN']
        maskMC = ((pos_mean < MC1) | (pos_mean < MC2) |
                  (neg_mean < MC1) | (neg_mean < MC2))

        fdr_values = self.df['ANOVA_FEATURE_FDR']
        for fdr in fdrs:
            # significant hits
            below = fdr_values < fdr
            significants.append(below.sum())

            # meaningful hits
            indices = below & maskMC
            significant_meaningful.append(indices.sum())

            # boolean selection with .loc (the .ix indexer no longer
            # exists in current pandas)
            subset = self.df.loc[indices]
            pos_strong = subset['FEATURE_pos_Glass_delta'] >= 1
            neg_strong = subset['FEATURE_neg_Glass_delta'] >= 1

            # meaningful strong hits: at least one Glass delta >= 1
            strong_hits.append((pos_strong | neg_strong).sum())

            # meaningful full strong hits: both Glass deltas >= 1
            full_strong_hits.append((pos_strong & neg_strong).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
                'strong_hits': strong_hits,
                'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data,
                          columns=['significants', 'significant_meaningful',
                                   'strong_hits', 'full_strong_hits'],
                          index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
                      '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'),
        return df

    def __str__(self):
        # prints the dataframe information and returns an empty string
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`. An ``a`` prefix is
        added when the identifier does not already start with ``a``.
        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        df = self.get_significant_set()

        N = len(df)
        pb = Progress(N)

        # one shared instance, updated for each association
        html = Association(self, drug='dummy', feature='dummy',
                           fdr='dummy', company=self.company)

        rows = zip(df['DRUG_ID'].values, df['FEATURE'].values,
                   df['ASSOC_ID'].values, df['ANOVA_FEATURE_FDR'].values)
        for i, (drug, feature, assoc, fdr) in enumerate(rows):
            html.drug = drug
            html.feature = feature
            if str(assoc).startswith("a"):
                html._filename = str(assoc) + '.html'
            else:
                html._filename = "a" + str(assoc) + '.html'
            html.fdr = fdr
            html.assoc_id = assoc
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        features = list(groups.indices.keys())
        pb = Progress(len(features))
        for i, feature in enumerate(features):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        A page is created for every drug found in :attr:`df`. Drugs
        without any significant association get an empty dict in place of
        the sub dataframe.
        """
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        pb = Progress(len(all_drugs))
        for i, drug in enumerate(all_drugs):
            # get the indices and therefore subgroup
            if drug in groups.groups:
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)

        :param onweb: open browser with the created HTML page.

        ``settings.savefig`` is forced to True while the page is built
        and restored afterwards, even if the creation fails.
        """
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report()  # created the directory
            html.create_report(onweb=onweb)
        finally:
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.
        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages

        :param onweb: open browser with the main page; the other pages
            are never opened.
        """
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        """Open ``index.html`` from the settings directory in a browser"""
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')