def add_features(self):
    """Save the feature/drug summaries and build the drug and feature tables.

    Writes ``OUTPUT/features_summary.csv`` and ``OUTPUT/drugs_summary.csv``
    under ``self.directory``, then stores the HTML of the per-drug and
    per-feature tables in ``self.jinja['drug_table']`` and
    ``self.jinja['feature_table']``.

    Returns early (after the two CSV files are written) when
    ``self.report.df`` is empty, in which case no table is created.
    """
    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = os.sep.join(['OUTPUT', 'features_summary.csv'])
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # No list of untested drugs is computed here: the placeholder is empty.
    self.jinja['drug_not_tested'] = ""

    # drug summary; prefix each index entry with the drug name when a
    # non-empty DRUG_DECODE table is available
    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        df_drugs.index = [
            "{}-{}".format(x, get_name(x)) for x in df_drugs.index
        ]
    filename = os.sep.join(['OUTPUT', 'drugs_summary.csv'])
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    if len(self.report.df) == 0:
        return

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    # Average only the FDR column. Taking the mean of the whole frame also
    # tries to average the other columns, which raises TypeError on
    # pandas >= 2.0 as soon as one of them is not numeric.
    mean_fdr = groups['ANOVA_FEATURE_FDR'].mean()
    try:
        df = mean_fdr.sort_values()
    except AttributeError:
        # legacy pandas without sort_values()
        df = mean_fdr.to_frame().sort('ANOVA_FEATURE_FDR')
    df = df.reset_index()  # get back the drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    members = groups.groups
    df['Number of associations computed'] = [
        len(members[k]) for k in df.DRUG_ID
    ]

    # number of significant associations per drug (0 when there is none)
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [
        len(significant[drug]) if drug in significant else 0
        for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID', url="associations/drug_", suffix=".html")
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # legacy pandas without sort_values()
        df.sort('FEATURE', inplace=True)

    # number of significant associations per feature (0 when there is none)
    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [
        len(significant[feature]) if feature in significant else 0
        for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE', url="associations/", suffix=".html")
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)
def _create_report(self, onweb=True):
    """Fill ``self.jinja`` with the main report sections and save the inputs.

    The method builds, in order: the diagnostics summary, the volcano plot
    section, the MANOVA link, the feature/drug summary CSV files, the drug
    and feature tables, the COSMIC table, and finally writes the input
    files (IC50, genomic features, DRUG_DECODE, settings), the full results
    CSV and a ``code/rerun.py`` script.

    :param onweb: accepted for interface compatibility; it is not used in
        this method body.
    """
    # A summary table
    diag = self.report.diagnostics()
    # NOTE(review): this table is never used before `table` is rebound
    # below; kept in case the constructor matters -- confirm and remove.
    table = HTMLTable(diag, 'summary')
    summary = []
    for _, row in diag.iterrows():
        if len(row.text) == 0 and len(row.value) == 0:
            summary.append('----<br/>')
        else:
            summary.append(row.text + ": " + str(row.value) + "<br/>")
    self.jinja['summary'] = "".join(summary)

    print('Creating volcano plots')
    # this can be pretty slow, so only a selection of the associations
    # is kept to get an idea of the distribution
    v = VolcanoANOVA(self.report.df, settings=self.settings)
    v.selector(v.df, 1500, 1500, inplace=True)
    v.volcano_plot_all()
    v.savefig_and_js("volcano_all_js")

    self.jinja['volcano'] = """
        <h3></h3>
        <a href="volcano_all_js.html">
            <img alt="volcano plot for all associations"
                src="volcano_all_js.png">
        </a>
        <br/>
        <p> A javascript version is available
            <a href="volcano_all_js.html">here</a> (
            or click on the image).</p>
    """

    # MANOVA link
    N = len(self.report.get_significant_set())
    self.jinja['manova'] = """
    There were %(N)s significant associations found.
    All significant associations have been gatherered
    in the following link: <br/><a href="manova.html">manova results</a>.
    """ % {'N': N}

    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = os.sep.join(['OUTPUT', 'features_summary.csv'])
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # drug summary
    # The list of drugs that could not be analysed is not computed; the
    # placeholder is left empty.
    self.jinja['drug_not_tested'] = ""

    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        # format() rather than "+" so that non-string (e.g. integer) drug
        # identifiers do not raise TypeError
        df_drugs.index = [
            "{}-{}".format(x, get_name(x)) for x in df_drugs.index
        ]
    filename = os.sep.join(['OUTPUT', 'drugs_summary.csv'])
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    # Average only the FDR column. Taking the mean of the whole frame also
    # tries to average the other columns, which raises TypeError on
    # pandas >= 2.0 as soon as one of them is not numeric.
    mean_fdr = groups['ANOVA_FEATURE_FDR'].mean()
    try:
        df = mean_fdr.sort_values()
    except AttributeError:
        # legacy pandas without sort_values()
        df = mean_fdr.to_frame().sort('ANOVA_FEATURE_FDR')
    df = df.reset_index()  # get back the drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    members = groups.groups
    df['Number of associations computed'] = [
        len(members[k]) for k in df.DRUG_ID
    ]

    # number of significant associations per drug (0 when there is none)
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [
        len(significant[drug]) if drug in significant else 0
        for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID')
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # legacy pandas without sort_values()
        df.sort('FEATURE', inplace=True)

    # number of significant associations per feature (0 when there is none)
    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [
        len(significant[feature]) if feature in significant else 0
        for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE')
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)

    # -------------------------------------- COSMIC table for completeness
    colnames = self.report.gdsc.features._special_names
    df = self.report.gdsc.features.df[colnames]

    # TODO
    # add other columns if possible e.g., GDSC1, GDSC2, TCGA

    df = df.reset_index()
    table = HTMLTable(df)
    url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
    table.add_href('COSMIC_ID', url=url, newtab=True)
    self.jinja['cosmic_table'] = table.to_html()

    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'

    # the IC50 matrix is saved on disk; the template gets the relative path
    self.report.gdsc.ic50.to_csv(os.sep.join([input_dir, 'ANOVA_input.csv']))
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    html = """Saved <a href="INPUT/genomic_features.csv">Genomic
              Features</a> file<br/> (possibly the default
              version)."""
    self.jinja['gf_file'] = html

    # Always save DRUG_DECODE file even if empty
    # It may be be interpreted in other pipeline or for reproducibility
    output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
    self.report.drug_decode.to_csv(output_filename)
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    filename = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(filename)
    filename = os.path.basename(filename)
    self.jinja['settings'] = \
        """Get the settings as a <a href="INPUT/%s">
        json file</a>.""" % filename

    # Save all Results dataframe
    filename = os.sep.join([self.settings.directory, 'OUTPUT',
                            'results.csv'])
    ANOVAResults(self.report.df).to_csv(filename)

    # Script that re-runs the analysis from the saved INPUT files. It is
    # assembled from a list of lines so that the generated file does not
    # depend on how this method is indented.
    code = "\n".join([
        "from gdsctools import *",
        "import os",
        "",
        "def getfile(filename, where='../INPUT'):",
        "    return os.sep.join([where, filename])",
        "",
        "# reback the IC50 and genomic features matrices",
        "gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),",
        "        getfile('DRUG_DECODE.csv'))",
        "gdsc.settings.from_json(getfile('settings.json'))",
        "gdsc.init()",
        "",
        "# Analyse the data",
        "results = gdsc.anova_all()",
        "",
        "# Create the HTML report",
        "r = ANOVAReport(gdsc, results)",
        "r.create_html_pages(onweb=False)",
    ])
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv',
    }

    filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # context manager so the handle is closed even if the write fails
    with open(filename, 'w') as fh:
        fh.write(code)
def add_features(self):
    """Save the feature/drug summaries and build the drug and feature tables.

    Writes ``OUTPUT/features_summary.csv`` and ``OUTPUT/drugs_summary.csv``
    under ``self.directory``, then stores the HTML of the per-drug and
    per-feature tables in ``self.jinja['drug_table']`` and
    ``self.jinja['feature_table']``.

    Returns early (after the two CSV files are written) when
    ``self.report.df`` is empty, in which case no table is created.
    """
    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = os.sep.join(['OUTPUT', 'features_summary.csv'])
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # No list of untested drugs is computed here: the placeholder is empty.
    self.jinja['drug_not_tested'] = ""

    # drug summary; prefix each index entry with the drug name when a
    # non-empty DRUG_DECODE table is available
    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        df_drugs.index = [
            "{}-{}".format(x, get_name(x)) for x in df_drugs.index
        ]
    filename = os.sep.join(['OUTPUT', 'drugs_summary.csv'])
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    if len(self.report.df) == 0:
        return

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    # Average only the FDR column. Taking the mean of the whole frame also
    # tries to average the other columns, which raises TypeError on
    # pandas >= 2.0 as soon as one of them is not numeric.
    mean_fdr = groups['ANOVA_FEATURE_FDR'].mean()
    try:
        df = mean_fdr.sort_values()
    except AttributeError:
        # legacy pandas without sort_values()
        df = mean_fdr.to_frame().sort('ANOVA_FEATURE_FDR')
    df = df.reset_index()  # get back the drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    members = groups.groups
    df['Number of associations computed'] = [
        len(members[k]) for k in df.DRUG_ID
    ]

    # number of significant associations per drug (0 when there is none)
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [
        len(significant[drug]) if drug in significant else 0
        for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID', url="associations/drug_", suffix=".html")
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # legacy pandas without sort_values()
        df.sort('FEATURE', inplace=True)

    # number of significant associations per feature (0 when there is none)
    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [
        len(significant[feature]) if feature in significant else 0
        for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE', url="associations/", suffix=".html")
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)
def _create_report(self, onweb=True):
    """Fill ``self.jinja`` with the main report sections and save the inputs.

    The method builds, in order: the diagnostics summary, the volcano plot
    section, the MANOVA link, the feature/drug summary CSV files, the drug
    and feature tables, the COSMIC table, and finally writes the input
    files (IC50, genomic features, DRUG_DECODE, settings), the full results
    CSV and a ``code/rerun.py`` script.

    :param onweb: accepted for interface compatibility; it is not used in
        this method body.
    """
    # A summary table
    diag = self.report.diagnostics()
    # NOTE(review): this table is never used before `table` is rebound
    # below; kept in case the constructor matters -- confirm and remove.
    table = HTMLTable(diag, 'summary')
    summary = []
    for _, row in diag.iterrows():
        if len(row.text) == 0 and len(row.value) == 0:
            summary.append('----<br/>')
        else:
            summary.append(row.text + ": " + str(row.value) + "<br/>")
    self.jinja['summary'] = "".join(summary)

    print('Creating volcano plots')
    # this can be pretty slow, so only a selection of the associations
    # is kept to get an idea of the distribution
    v = VolcanoANOVA(self.report.df, settings=self.settings)
    v.selector(v.df, 1500, 1500, inplace=True)
    v.volcano_plot_all()
    v.savefig_and_js("volcano_all_js")

    self.jinja['volcano'] = """
        <h3></h3>
        <a href="volcano_all_js.html">
            <img alt="volcano plot for all associations"
                src="volcano_all_js.png">
        </a>
        <br/>
        <p> A javascript version is available
            <a href="volcano_all_js.html">here</a> (
            or click on the image).</p>
    """

    # MANOVA link
    N = len(self.report.get_significant_set())
    self.jinja['manova'] = """
    There were %(N)s significant associations found.
    All significant associations have been gatherered
    in the following link: <br/><a href="manova.html">manova results</a>.
    """ % {'N': N}

    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = os.sep.join(['OUTPUT', 'features_summary.csv'])
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # drug summary
    # The list of drugs that could not be analysed is not computed; the
    # placeholder is left empty.
    self.jinja['drug_not_tested'] = ""

    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        # format() rather than "+" so that non-string (e.g. integer) drug
        # identifiers do not raise TypeError
        df_drugs.index = [
            "{}-{}".format(x, get_name(x)) for x in df_drugs.index
        ]
    filename = os.sep.join(['OUTPUT', 'drugs_summary.csv'])
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    # Average only the FDR column. Taking the mean of the whole frame also
    # tries to average the other columns, which raises TypeError on
    # pandas >= 2.0 as soon as one of them is not numeric.
    mean_fdr = groups['ANOVA_FEATURE_FDR'].mean()
    try:
        df = mean_fdr.sort_values()
    except AttributeError:
        # legacy pandas without sort_values()
        df = mean_fdr.to_frame().sort('ANOVA_FEATURE_FDR')
    df = df.reset_index()  # get back the drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    members = groups.groups
    df['Number of associations computed'] = [
        len(members[k]) for k in df.DRUG_ID
    ]

    # number of significant associations per drug (0 when there is none)
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [
        len(significant[drug]) if drug in significant else 0
        for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID')
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # legacy pandas without sort_values()
        df.sort('FEATURE', inplace=True)

    # number of significant associations per feature (0 when there is none)
    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [
        len(significant[feature]) if feature in significant else 0
        for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE')
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)

    # -------------------------------------- COSMIC table for completeness
    colnames = self.report.gdsc.features._special_names
    df = self.report.gdsc.features.df[colnames]

    # TODO
    # add other columns if possible e.g., GDSC1, GDSC2, TCGA

    df = df.reset_index()
    table = HTMLTable(df)
    url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
    table.add_href('COSMIC_ID', url=url, newtab=True)
    self.jinja['cosmic_table'] = table.to_html()

    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'

    # the IC50 matrix is saved on disk; the template gets the relative path
    self.report.gdsc.ic50.to_csv(os.sep.join([input_dir, 'ANOVA_input.csv']))
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    html = """Saved <a href="INPUT/genomic_features.csv">Genomic
              Features</a> file<br/> (possibly the default
              version)."""
    self.jinja['gf_file'] = html

    # Always save DRUG_DECODE file even if empty
    # It may be be interpreted in other pipeline or for reproducibility
    output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
    self.report.drug_decode.to_csv(output_filename)
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    filename = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(filename)
    filename = os.path.basename(filename)
    self.jinja['settings'] = \
        """Get the settings as a <a href="INPUT/%s">
        json file</a>.""" % filename

    # Save all Results dataframe
    filename = os.sep.join([self.settings.directory, 'OUTPUT',
                            'results.csv'])
    ANOVAResults(self.report.df).to_csv(filename)

    # Script that re-runs the analysis from the saved INPUT files. It is
    # assembled from a list of lines so that the generated file does not
    # depend on how this method is indented.
    code = "\n".join([
        "from gdsctools import *",
        "import os",
        "",
        "def getfile(filename, where='../INPUT'):",
        "    return os.sep.join([where, filename])",
        "",
        "# reback the IC50 and genomic features matrices",
        "gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),",
        "        getfile('DRUG_DECODE.csv'))",
        "gdsc.settings.from_json(getfile('settings.json'))",
        "gdsc.init()",
        "",
        "# Analyse the data",
        "results = gdsc.anova_all()",
        "",
        "# Create the HTML report",
        "r = ANOVAReport(gdsc, results)",
        "r.create_html_pages(onweb=False)",
    ])
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv',
    }

    filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # context manager so the handle is closed even if the write fails
    with open(filename, 'w') as fh:
        fh.write(code)