def get_html_table(self, collapse_table=False, clip_threshold=2,
                   index=False, header=True, escape=False):
    """Return the results as a colour-annotated HTML table.

    Only the columns listed in ``self.colnames_subset`` are rendered.
    Values of ``ANOVA_FEATURE_FDR`` below 0.01 are shown as the string
    ``'<0.01'``.

    :param collapse_table: forwarded to ``HTMLTable.to_html``
    :param clip_threshold: threshold given to the ``clip`` colouring of
        the effect-size and Glass-delta columns
    :param index: forwarded to ``HTMLTable.to_html``
    :param header: forwarded to ``HTMLTable.to_html``
    :param escape: forwarded to ``HTMLTable.to_html``
    :return: whatever ``HTMLTable.to_html`` returns
    """
    cmap_clip = cmap_builder('#ffffff', '#0070FF')
    cmap_absmax = cmap_builder('green', 'white', 'red')

    # Work on a copy: the FDR column is overwritten below and self.df
    # must stay untouched.
    df = self.df[self.colnames_subset].copy()
    colname = 'ANOVA_FEATURE_FDR'
    df.loc[df[colname] < 0.01, colname] = '<0.01'

    # Build the table from the modified copy (not from self.df), otherwise
    # the '<0.01' substitution and the column subset are lost.
    html = HTMLTable(df, 'notused')

    # Those columns should be links
    for this in ['FEATURE', 'DRUG_ID', 'ASSOC_ID']:
        html.add_href(this)

    for this in ['FEATURE_IC50_effect_size', 'FEATURE_neg_Glass_delta',
                 'FEATURE_pos_Glass_delta']:
        html.add_bgcolor(this, cmap_clip, mode='clip',
                         threshold=clip_threshold)

    # normalise data and annotate with color
    html.add_bgcolor('FEATURE_delta_MEAN_IC50', cmap_absmax, mode='absmax')

    # Underscores are replaced by spaces in the displayed column names
    html.df.columns = [x.replace("_", " ") for x in html.df.columns]
    return html.to_html(escape=escape, header=header, index=index,
                        collapse_table=collapse_table, justify='center')
def get_html_table(self, collapse_table=False, clip_threshold=2,
                   index=False, header=True, escape=False, add_href=True):
    """Return an HTML table for the reports

    :param collapse_table: forwarded to ``HTMLTable.to_html``
    :param clip_threshold: threshold of the ``clip`` colouring applied to
        the effect-size and Glass-delta columns
    :param index: forwarded to ``HTMLTable.to_html``
    :param header: forwarded to ``HTMLTable.to_html``
    :param escape: forwarded to ``HTMLTable.to_html``
    :param add_href: add href to the FEATURE, DRUG ID and ASSOC ID
    """
    clip_cmap = cmap_builder('#ffffff', '#0070FF')
    absmax_cmap = cmap_builder('green', 'white', 'red')

    # A copy is taken since two of its columns are rewritten below.
    fdr_column = 'ANOVA_FEATURE_FDR'
    subset = self.df[self.colnames_subset].copy()
    subset.loc[subset[fdr_column] < 0.01, fdr_column] = '<0.01'

    def _strip_prefix(assoc_id):
        # Drop the "a" letter so that the column sorts by numeric id; the
        # link built below puts the "a" back as a prefix.
        return int(str(assoc_id).replace("a", ""))

    subset.ASSOC_ID = subset.ASSOC_ID.apply(_strip_prefix)

    table = HTMLTable(subset, 'notused')

    # Those columns should be links (here url works like a prefix)
    if add_href:
        table.add_href("FEATURE")
        table.add_href("ASSOC_ID", url="a", suffix=".html")
        table.add_href("DRUG_ID", url="drug_", suffix=".html")

    clipped_columns = (
        'FEATURE_IC50_effect_size',
        'FEATURE_neg_Glass_delta',
        'FEATURE_pos_Glass_delta',
    )
    for column in clipped_columns:
        table.add_bgcolor(column, clip_cmap, mode='clip',
                          threshold=clip_threshold)

    # normalise data and annotate with color
    table.add_bgcolor('FEATURE_delta_MEAN_IC50', absmax_cmap, mode='absmax')

    table.df.columns = [name.replace("_", " ") for name in table.df.columns]
    return table.to_html(escape=escape, header=header, index=index,
                         collapse_table=collapse_table, justify='center')
def add_cosmic(self):
    """Store the COSMIC table (for completeness) in ``self.jinja``.

    The columns named by ``features._special_names`` are extracted from
    the genomic features, the index is turned back into a column, and the
    ``COSMIC_ID`` column is linked to the Sanger cell-line overview page.
    The rendered table is saved under the ``cosmic_table`` key.
    """
    special_columns = self.report.gdsc.features._special_names
    # TODO
    # add other columns if possible e.g., GDSC1, GDSC2, TCGA
    cosmic = self.report.gdsc.features.df[special_columns].reset_index()

    cosmic_table = HTMLTable(cosmic)
    cosmic_table.add_href(
        'COSMIC_ID',
        url="http://cancer.sanger.ac.uk/cell_lines/sample/overview?id=",
        newtab=True)
    self.jinja['cosmic_table'] = cosmic_table.to_html()
def test_htmltable():
    """Smoke test: build an HTMLTable, decorate it and print the HTML.

    One column gets hyperlinks and three numeric columns get a background
    colour, each with a different colouring mode.
    """
    frame = pd.DataFrame(dict(
        A=[1, 2, 10],
        B=[1, 10, 2],
        C=[1, 10, 2],
        url=['A', 'B', 'C'],
    ))
    table = HTMLTable(frame, 'test')
    table.add_href('url')
    table.add_bgcolor('A')
    table.add_bgcolor('B', mode='clip', threshold=2)
    table.add_bgcolor('C', mode='max', threshold=2)
    rendered = table.to_html()
    print(rendered)
def add_features(self):
    """Create the feature/drug summaries and their HTML tables.

    Side effects:

    * writes ``OUTPUT/features_summary.csv`` and
      ``OUTPUT/drugs_summary.csv`` below ``self.directory``;
    * sets ``self.jinja['drug_not_tested']`` (always an empty string);
    * unless the report has no association at all, sets
      ``self.jinja['drug_table']`` and ``self.jinja['feature_table']``.

    Nothing is returned.
    """
    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = 'OUTPUT' + os.sep + 'features_summary.csv'
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    self.jinja['drug_not_tested'] = ""

    # drug summary; the index becomes "<id>-<name>" when a drug decoder
    # is available
    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        df_drugs.index = [
            "{}-{}".format(x, get_name(x)) for x in df_drugs.index
        ]
    filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    # Without any association there is nothing to tabulate
    if len(self.report.df) == 0:
        return

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    # Select the column BEFORE averaging: averaging every column fails
    # with recent pandas as soon as one column is not numeric.
    mean_fdr = groups['ANOVA_FEATURE_FDR'].mean()
    try:
        df = mean_fdr.sort_values()
    except AttributeError:
        # legacy pandas without sort_values
        df = mean_fdr.to_frame().sort()
    df = df.reset_index()  # get back the Drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    members = groups.groups
    df['Number of associations computed'] = [
        len(members[k]) for k in df.DRUG_ID
    ]

    # number of significant associations per drug (0 if none)
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [
        len(significant.get(drug, ())) for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID', url="associations/drug_", suffix=".html")
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # legacy pandas without sort_values
        df.sort('FEATURE', inplace=True)

    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [
        len(significant.get(feature, ())) for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE', url="associations/", suffix=".html")
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)
def _create_report(self, onweb=True):
    """Append the sections of the regression report to ``self.jinja``.

    Three entries are appended to ``self.jinja['sections']``: a header
    naming the regression method, the results table (read from
    ``<prefix_data>results.csv``) with a link to the CSV file, and the
    canvas hosting the scatter plot.

    :param onweb: not used by this method
    """
    # The top section with standard information
    header = """<div> <b>Regression method:</b> %s </br> </div><hr> """ % self.caller.method
    self.jinja['sections'].append(header)

    # The main CSV tables with bayes factor and links to each drug ID
    csv_path = self.caller.prefix_data + "results.csv"
    results = pd.read_csv(csv_path)
    results['ttest (-log10)'] = -pylab.log10(results['ttest'])

    # prevents inf to fail in the HTMLTable
    table = HTMLTable(results)
    for column in ('bayes', 'Rp'):
        table.add_bgcolor(column)
    link = '<a href="drug_%s.html">%s</a>'
    table.df['drugid'] = [link % (drug, drug) for drug in table.df['drugid']]

    explanation = "".join((
        "<div><p>This table contains links to all drugs (first column).",
        " The Rp column contains the coefficient of correlation",
        " (pearson) found with the regression method for the alpha",
        " parameter provided in column 3. The alpha value is the optimised",
        " value obtained using a cross validation (see below).",
        " The ln_alpha column is just the -log10(alpha) value. The bayes",
        " factor gives an idea of the significance of the correlation as ",
        " compared to a null distribution. See",
        ' <a href="http://gdsctools.readthedocs.io/en/master/references.html">',
        'gdstools documentation.</a> for details.',
        "<br>",
        " Note also that the optimisation of the alpha parameter is",
        " performed using a cross validation and depends on a few",
        " parameters such as the range of alpha values, number of ",
        " cross validation, ....</p>",
    ))
    table_html = explanation + table.to_html(index=False)

    wrapper = '<div>%s <p>Download the CSV <a href="%s">file</a></p></div><hr>'
    self.jinja['sections'].append(wrapper % (table_html, csv_path))

    # The scatter plot. First the javascript in the header
    self._set_scatter()

    # and the section itself
    canvas = """ <div class="wrap"> <div class="content"> <center> <canvas id='canvasVolcano' width='800' height='540'></canvas> </center> </div> <div class="clear"> </div> </div> """
    self.jinja["sections"].append(canvas)
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA
    tissues and companies.

    Reads

    1. Nf genomic feature files for different TCGA types.
    2. a unique DRUG DECODER file
    3. a unique IC50 file

    and perform the Nf analysis saving results in appropriate files.
    Then split the data for each companies. It also converts tissue names
    into TCGA names.

    .. todo:: How to get the GF files

    First, create all main analysis that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contains specific TCGA GF
        gg.run()
        # This will take hours depending on the number of drugs.
        # On v18, on an i7 core using 1 CPU this takes about 1 hour 30 minutes
        # The PANCAN data set is the largest and takes about 1 hour itself

    You should now have a directory called **ALL** with about 20
    directories for each TCGA GF file. Keep that in a safe place or you
    will have to restart the analysis.

    Second, split those data just created for each specific proprietary
    compounds. For instance::

        gg.create_data_packages_for_companies(['AZ'])

    or for all in one go::

        gg.create_data_packages_for_companies()

    Third, create some summary pages::

        from gdsctools.gdsc import GDSCDirectorySummary
        gs = GDSCDirectorySummary()
        gs.create_summary_pages('ALL')
        for company in gg.companies:
            gs.create_summary_pages(company)

    The last step is fast but the whole process of analysis and image
    creation is very long.
    """

    def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
                 mode='standard'):
        """Read the inputs and prepare the ANOVA settings.

        :param ic50: IC50 filename
        :param drug_decode: drug decoder filename
        :param genomic_feature_pattern: pattern handed to the base class
        :param mode: ``'v18'`` reads the IC50 with ``IC50Cluster`` and sets
            ``FDR_threshold`` to 35; any other value reads it with ``IC50``
        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False

        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)

        self.settings = ANOVASettings()
        self.settings.low_memory = True
        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print(
            "Those settings will be used (note that low_memory is set "
            "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # results of each analysis, keyed by TCGA name
        self.results = {}

    @staticmethod
    def _tcga_name(gf_filename):
        """Return the TCGA name encoded in a genomic feature filename."""
        return gf_filename.split("_")[1].split('.')[0]

    def run(self):
        """Create the ``ALL`` directory and analyse every GF file."""
        self.mkdir('ALL')
        # First analyse all case of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        """Run the ANOVA and create the HTML pages for each GF file.

        Results are stored in ``self.results`` (keyed by TCGA name) and the
        pages are written below ``ALL/<tcga>``.
        """
        for gf_filename in sorted(self.gf_filenames):
            tcga = self._tcga_name(gf_filename)
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init()  # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        """Create one data package per company from the ``ALL`` analysis.

        NOTE: the DRUG_DECODE and IC50 inputs must be filtered to keep
        only WEBRELEASE=Y and owner.

        :param companies: a company name, a list of names, or None to use
            ``self.companies``
        """
        if isinstance(companies, str):
            companies = [companies]
        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)

            for gf_filename in sorted(self.gf_filenames):
                tcga = self._tcga_name(gf_filename)
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed, from memory when
                # available and from the ALL directory otherwise
                try:
                    results_df = self.results[tcga].df.copy()
                except KeyError:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)
                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                company_drugs = drug_decode_company.df.index

                # filter results using the new drug decode
                # (.loc with a boolean mask replaces the removed .ix)
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
                mask = [x in company_drugs for x in drug_ids_in_results]
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                # Column selection by predicate; replaces the removed
                # DataFrame.select(func, axis=1)
                keep = [
                    get_drug_id(drug) in company_drugs
                    for drug in an.ic50.df.columns
                ]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga

                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))

                    # The directories do not depend on the drug
                    html_source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                    html_dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                    image_source = "ALL%s%s%simages%s" % (
                        os.sep, tcga, os.sep, os.sep)
                    image_dest = "%s%s%s%simages%s" % (
                        company, os.sep, tcga, os.sep, os.sep)

                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        cmd = "cp %s%s %s" % (html_source, filename,
                                              html_dest)
                        shellcmd(cmd, verbose=False)

                        # copy the images
                        filename = "volcano_%s.*" % drug_id
                        cmd = "cp %s%s %s" % (image_source, filename,
                                              image_dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)

    def _get_tcga(self):
        return [self._tcga_name(x) for x in self.gf_filenames]
    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies)

    def create_summary_pages(self, main_directory='ALL'):
        """Create the index page summarising every analysis found in ALL.

        Static files (css, js, logos) are copied into ``main_directory``
        when missing, then one row per sub-directory of ``ALL`` is
        collected and rendered with the ``datapack_summary.html`` template.

        :param main_directory: where the static files are copied
        :return: the summary as a DataFrame
        """
        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')

        from gdsctools import gdsctools_data

        # (sub-directory, filenames, prefix of the packaged data file)
        static_files = [
            ('css', ['gdsc.css', 'github-gist.css'], ''),
            ('js', ['highlight.pack.js'], ''),
            ('images', ['EBI_logo.png', 'sanger-logo.png'],
             'images' + os.sep),
        ]
        for subdir, filenames, data_prefix in static_files:
            for filename in filenames:
                target = os.sep.join([main_directory, subdir, filename])
                if os.path.isfile(target) is False:
                    shutil.copy(gdsctools_data(data_prefix + filename),
                                target)

        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[1]
            # Skip the static directories created above ('js' included,
            # otherwise it shows up as an empty analysis)
            if tcga in ['css', 'images', 'js']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except (IOError, OSError, ValueError):
                # missing or unreadable summary: keep an empty row
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            # reindex replaces the removed .ix: drugs absent from the
            # decoder give NaN and are counted as not public
            webrelease = drug_decode.df.reindex(drug_involved).WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])

        # Naming the columns at construction also works when no analysis
        # directory was found
        df = pd.DataFrame(summary, columns=[
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ])

        # FIXME include css and images of logo
        # FIXME save in the proper directory
        self.html_page = ReportMAIN(directory='ALL', filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                                 suffix='/index.html')
        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()
        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults(
                'ALL' + os.sep + tcga + os.sep + 'OUTPUT' + os.sep +
                'results.csv')
def _create_report(self, onweb=True):
    """Fill ``self.jinja`` for the main page and write the data files.

    The method

    * renders the diagnostics as text (``summary``);
    * creates the volcano plot of all associations (``volcano``);
    * links to the MANOVA results (``manova``);
    * writes the feature and drug summaries to ``OUTPUT`` and builds the
      ``drug_table``, ``feature_table`` and ``cosmic_table`` entries;
    * saves the IC50, genomic features, drug decoder and settings in
      ``INPUT``, the full results in ``OUTPUT/results.csv`` and a script
      ``code/rerun.py`` to reproduce the analysis.

    :param onweb: not used by this method
    """
    # A summary table
    diag = self.report.diagnostics()
    summary_lines = []
    for _, row in diag.iterrows():
        if len(row.text) == 0 and len(row.value) == 0:
            summary_lines.append('----<br/>')
        else:
            summary_lines.append(
                row.text + ": " + str(row.value) + "<br/>")
    self.jinja['summary'] = "".join(summary_lines)

    print('Creating volcano plots')
    # this can be pretty slow, so only a selection of the associations is
    # plotted (1500/1500 passed to the selector; presumably the most
    # relevant ones plus random ones - confirm against VolcanoANOVA)
    v = VolcanoANOVA(self.report.df, settings=self.settings)
    v.selector(v.df, 1500, 1500, inplace=True)
    v.volcano_plot_all()
    v.savefig_and_js("volcano_all_js")

    self.jinja['volcano'] = (
        ' <h3></h3> <a href="volcano_all_js.html"> '
        '<img alt="volcano plot for all associations" '
        'src="volcano_all_js.png"> </a> <br/> '
        '<p> A javascript version is available '
        '<a href="volcano_all_js.html">here</a> ('
        ' or click on the image).</p> ')

    # MANOVA link
    N = len(self.report.get_significant_set())
    self.jinja['manova'] = (
        ' There were %(N)s significant associations found. '
        'All significant associations have been gatherered '
        'in the following link: '
        '<br/><a href="manova.html">manova results</a>. \n') % {'N': N}

    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = 'OUTPUT' + os.sep + 'features_summary.csv'
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # drug summary (the list of drugs not tested is not computed)
    self.jinja['drug_not_tested'] = ""

    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        # format() rather than "+": drug identifiers may not be strings
        df_drugs.index = [
            "{}-{}".format(x, get_name(x)) for x in df_drugs.index
        ]
    filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    # Select the column BEFORE averaging: averaging every column fails
    # with recent pandas as soon as one column is not numeric.
    mean_fdr = groups['ANOVA_FEATURE_FDR'].mean()
    try:
        df = mean_fdr.sort_values()
    except AttributeError:
        # legacy pandas without sort_values
        df = mean_fdr.to_frame().sort()
    df = df.reset_index()  # get back the Drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    members = groups.groups
    df['Number of associations computed'] = [
        len(members[k]) for k in df.DRUG_ID
    ]

    # number of significant associations per drug (0 if none)
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [
        len(significant.get(drug, ())) for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID')
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # legacy pandas without sort_values
        df.sort('FEATURE', inplace=True)

    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [
        len(significant.get(feature, ())) for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE')
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)

    # -------------------------------------- COSMIC table for completeness
    colnames = self.report.gdsc.features._special_names
    df = self.report.gdsc.features.df[colnames]
    # TODO
    # add other columns if possible e.g., GDSC1, GDSC2, TCGA
    df = df.reset_index()
    table = HTMLTable(df)
    url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
    table.add_href('COSMIC_ID', url=url, newtab=True)
    self.jinja['cosmic_table'] = table.to_html()

    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'
    filename = os.sep.join([input_dir, 'ANOVA_input.csv'])
    self.report.gdsc.ic50.to_csv(filename)
    # the page links to the file relative to the report directory
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    html = """Saved <a href="INPUT/genomic_features.csv">Genomic Features</a> file<br/> (possibly the default version)."""
    self.jinja['gf_file'] = html

    # Always save DRUG_DECODE file even if empty
    # It may be be interpreted in other pipeline or for reproducibility
    output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
    self.report.drug_decode.to_csv(output_filename)
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    filename = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(filename)
    filename = os.path.basename(filename)
    self.jinja['settings'] = \
        """Get the settings as a <a href="INPUT/%s"> json file</a>.""" % filename

    # Save all Results dataframe
    filename = os.sep.join(
        [self.settings.directory, 'OUTPUT', 'results.csv'])
    ANOVAResults(self.report.df).to_csv(filename)

    # Script to reproduce the analysis from the saved INPUT files. It is
    # assembled line by line so that the written file is valid Python.
    code = "\n".join([
        "from gdsctools import *",
        "import os",
        "",
        "def getfile(filename, where='../INPUT'):",
        "    return os.sep.join([where, filename])",
        "",
        "# reback the IC50 and genomic features matrices",
        "gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),",
        "        getfile('DRUG_DECODE.csv'))",
        "gdsc.settings.from_json(getfile('settings.json'))",
        "gdsc.init()",
        "",
        "# Analyse the data",
        "results = gdsc.anova_all()",
        "",
        "# Create the HTML report",
        "r = ANOVAReport(gdsc, results)",
        "r.create_html_pages(onweb=False)",
    ])
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv'
    }

    filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # the context manager closes the file even if the write fails
    with open(filename, 'w') as fh:
        fh.write(code)