def _create_report(self, onweb=True):
    """Fill the jinja context with the results table and the boxplot images.

    :param onweb: not used by this method.
    """
    # run() returns the results (the original note says it also generates
    # the pictures)
    results_df = self.run()

    # Create the table and add it
    self.jinja['association_table'] = ANOVAResults(results_df).get_html_table(
        escape=False, header=True, index=False)

    def image_tag(prefix):
        # One <img> element for the picture identified by prefix/drug/feature
        filename = "{0}_{1}____{2}.png".format(prefix, self.drug,
                                               self.feature)
        return '<img alt="association {0}" src="{0}">\n'.format(filename)

    settings = self.factory.settings

    # Main boxplot always included
    tags = [image_tag('images/ODOF_all')]
    if settings.include_MSI_factor:
        tags.append(image_tag('ODOF_msi'))
    if settings.analysis_type == 'PANCAN':
        tags.append(image_tag('ODOF_tissue'))

    self.jinja['boxplots'] = "".join(tags)
def _create_report(self, onweb=True):
    """Fill the jinja context with the results table and boxplot JS data.

    :param onweb: not used by this method.
    """
    results_df = self.run()
    odof_data = self.factory._get_one_drug_one_feature_data(
        self.drug, self.feature)

    # Create the table and add it
    results = ANOVAResults(results_df)
    self.jinja['association_table'] = results.get_html_table(
        escape=False, header=True, index=False, add_href=self.add_href)

    # Javascript version of the boxplots; the main one is always included
    boxplots = BoxPlotsJS(odof_data)
    self.jinja["boxplot_all_jsdata"] = boxplots.get_html_association()

    settings = self.factory.settings
    if settings.include_MSI_factor:
        self.jinja["boxplot_msi_jsdata"] = boxplots.get_html_msi()
    if settings.include_media_factor:
        self.jinja["boxplot_media_jsdata"] = boxplots.get_html_media()
    if settings.analysis_type == 'PANCAN':
        self.jinja["boxplot_tissue_jsdata"] = boxplots.get_html_tissue()

    # No static image section in this version
    self.jinja['boxplots'] = ""
def load_results(self):
    """Load ALL/<tcga>/OUTPUT/results.csv for every entry of ``self.tcga``.

    Each file is wrapped in an ANOVAResults instance stored in
    ``self.results`` under the TCGA name.
    """
    for name in self.tcga:
        print(name)
        path = os.sep.join(['ALL', name, 'OUTPUT', 'results.csv'])
        self.results[name] = ANOVAResults(path)
def _create_report(self, onweb=True):
    """Fill the jinja context with hit count, results table and volcano data.

    :param onweb: not used by this method.
    """
    n_hits = len(self.subdf)
    self.jinja['N_hits'] = n_hits

    # The table is only built when there is at least one association
    if n_hits > 0:
        results = ANOVAResults(self.subdf)
        self.jinja['association_table'] = results.get_html_table(
            escape=False, header=True, index=False)

    self.jinja['volcano_jsdata'] = self.create_pictures()
def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly (``gdsc.anova_all()``).
    :param sep: not used by this constructor.
    :param drug_decode: optional input passed to :class:`DrugDecode`. When
        provided, it is used instead of ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()

    # per the original note, this does a copy and sanity check
    self.df = ANOVAResults(results).df

    # Make sure the DRUG identifiers are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # Local copy of the settings found in the gdsc instance
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best effort, but a
    # failure is now reported instead of being silently ignored.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception as err:
        warnings.warn(
            "Could not replace infinite values in the results: %s" % err)

    # create some data
    self._set_sensible_df()

    self.company = None
def _create_report(self, onweb=True):
    """Create the pictures and fill the jinja context for this feature.

    :param onweb: not used by this method.
    """
    self.create_pictures()

    # Table section; the table is only built when there are hits
    n_hits = len(self.subdf)
    self.jinja['N_hits'] = n_hits
    if n_hits > 0:
        results = ANOVAResults(self.subdf)
        self.jinja['association_table'] = results.get_html_table(
            escape=False, header=True, index=False)

    # image section
    image = "images/volcano_{0}".format(self.feature)
    self.jinja['image_filename'] = image
def __init__(self, gdsc, df):
    """.. rubric:: constructor

    :param gdsc: instance whose ``settings`` provide the output directory
        and the analysis type.
    :param df: the results rendered as the HTML table of the page.

    The page filename and its template are both set to manova.html when
    calling the parent constructor.
    """
    settings = gdsc.settings
    super(HTMLPageMANOVA, self).__init__(
        filename='manova.html',
        directory=settings.directory,
        template_filename='manova.html')

    table = ANOVAResults(df)
    self.jinja['manova'] = table.get_html_table(collapse_table=False)
    self.jinja['analysis_domain'] = gdsc.settings.analysis_type
def _create_report(self, onweb=True):
    """Fill the jinja context: placeholders, results table and volcano data.

    :param onweb: not used by this method.
    """
    # Placeholders for information that is not populated here
    placeholders = (('synonyms', ''), ('brand_name', ''),
                    ('conc_min', '?'), ('conc_max', '?'))
    for key, value in placeholders:
        self.jinja[key] = value

    # Table section; the table is only built when there are hits
    n_hits = len(self.subdf)
    self.jinja['N_hits'] = n_hits
    if n_hits > 0:
        results = ANOVAResults(self.subdf)
        self.jinja['association_table'] = results.get_html_table(
            escape=False, header=True, index=False)

    # image section
    self.jinja['volcano_jsdata'] = self.create_pictures()
def __init__(self, report, df, company):
    """.. rubric:: constructor

    :param report: instance whose ``settings`` provide the output
        directory and the analysis type.
    :param df: the results rendered as the HTML table of the page.
    :param company: stored in the jinja context as "collaborator".

    The page (manova.html) is placed in the "associations" sub-directory
    of the report directory.
    """
    target_dir = os.sep.join([report.settings.directory, "associations"])
    super(HTMLPageMANOVA, self).__init__(
        filename='manova.html',
        directory=target_dir,
        template_filename='manova.html',
        init_report=False)

    table = ANOVAResults(df)
    self.jinja['manova'] = table.get_html_table(collapse_table=False)
    self.jinja['analysis_domain'] = report.settings.analysis_type
    self.jinja['resource_path'] = ".."
    self.jinja["collaborator"] = company
def __init__(self, gdsc, results, sep="\t", drug_decode=None):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`
    :param sep: not used by this constructor.
    :param drug_decode: optional input passed to :class:`DrugDecode`. When
        provided, it is used instead of ``gdsc.drug_decode``.
    """
    self.figtools = Savefig()
    self.gdsc = gdsc

    # per the original note, this does a copy and sanity check
    self.df = ANOVAResults(results).df

    # Local copy of the settings found in the gdsc instance
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences used to be glued together without a space
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # create some data
    self._set_sensible_df()

    # just to create the directory
    ReportMAIN(directory=self.settings.directory)
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    :param companies: one company name (string), a list of names, or None
        to use ``self.companies``.
    :raises ValueError: if the list of companies is empty.

    For each company and each genomic feature file (one per TCGA case), the
    previously computed results are restricted to the drugs that are either
    public (WEBRELEASE=='Y') or owned by the company, and the HTML report
    is created in <company_directory>/<company>/<tcga>.
    """
    # NOTE: DRUG_DECODE and IC50 inputs must be filtered to keep
    # only WEBRELEASE=Y and owner

    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError(
            "Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(purple("\n=========== Analysing company %s out of %s (%s)" %
                     (ii + 1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown(" ------- building TCGA %s sub directory" % tcga))

            # Use the results held in memory if possible, otherwise read
            # them from the main directory. Only regular errors trigger the
            # fallback (a bare except would also catch KeyboardInterrupt).
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only and
            # transform it into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company))
            company_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure the identifiers are integers
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = results.df["DRUG_ID"].isin(company_drugs)
            # .loc replaces the .ix indexer, which was removed from pandas
            results.df = results.df.loc[mask]

            # We read the IC50 again
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analysis
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns of the drugs selected above
            # (DataFrame.select was removed from pandas)
            keep = [drug in company_drugs for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = (self.company_directory + os.sep +
                                     company + os.sep + tcga)
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def _create_report(self, onweb=True):
    """Fill the jinja context of the main page and save the data files.

    :param onweb: not used by this method.

    The following is done in order: summary text, volcano plot, MANOVA
    link, feature and drug summaries (saved as CSV in OUTPUT), the drug,
    feature and COSMIC tables, and finally the INPUT files, the settings,
    the full results and a script (code/rerun.py) to rerun the analysis.
    """
    # ----------------------------------------------------- summary text
    diag = self.report.diagnostics()
    lines = []
    for _, row in diag.iterrows():
        if len(row.text) == 0 and len(row.value) == 0:
            lines.append('----<br/>')
        else:
            lines.append(row.text + ": " + str(row.value) + "<br/>")
    self.jinja['summary'] = "".join(lines)

    # ----------------------------------------------------- volcano plot
    print('Creating volcano plots')
    # this can be pretty slow, so only a selection of the associations is
    # plotted (selector is called with 1500 and 1500; presumably the most
    # relevant values and random ones -- TODO confirm)
    v = VolcanoANOVA(self.report.df, settings=self.settings)
    v.selector(v.df, 1500, 1500, inplace=True)
    v.volcano_plot_all()
    v.savefig_and_js("volcano_all_js")

    self.jinja['volcano'] = """
        <h3></h3>
        <a href="volcano_all_js.html">
            <img alt="volcano plot for all associations"
                src="volcano_all_js.png">
        </a>
        <br/>
        <p> A javascript version is available
            <a href="volcano_all_js.html">here</a> (
            or click on the image).</p>
    """

    # ------------------------------------------------------ MANOVA link
    N = len(self.report.get_significant_set())
    self.jinja['manova'] = """
    There were %(N)s significant associations found.
    All significant associations have been gatherered
    in the following link: <br/><a href="manova.html">manova results</a>.
    """ % {'N': N}

    # -------------------------------------------------- feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = 'OUTPUT' + os.sep + 'features_summary.csv'
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # ----------------------------------------------------- drug summary
    # The list of drugs that were not tested is not computed here
    self.jinja['drug_not_tested'] = ""

    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
    filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    try:
        # Only the FDR column is averaged; the other columns are not needed
        df = groups['ANOVA_FEATURE_FDR'].mean().sort_values()
    except AttributeError:
        # older pandas without sort_values
        # note double brackets for python3.3
        df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
    df = df.reset_index()  # get back the Drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    df['Number of associations computed'] = [
        len(groups.groups[k]) for k in df.DRUG_ID]

    # and the number of significant associations for each drug
    significant = self.report.get_significant_set().groupby('DRUG_ID').groups
    df['hits'] = [len(significant[drug]) if drug in significant else 0
                  for drug in df['DRUG_ID'].values]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID')
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # older pandas without sort_values
        df.sort('FEATURE', inplace=True)

    significant = self.report.get_significant_set().groupby('FEATURE').groups
    df['hits'] = [len(significant[feature]) if feature in significant else 0
                  for feature in df['FEATURE'].values]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE')
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)

    # -------------------------------------- COSMIC table for completeness
    colnames = self.report.gdsc.features._special_names
    df = self.report.gdsc.features.df[colnames]

    # TODO
    # add other columns if possible e.g., GDSC1, GDSC2, TCGA
    df = df.reset_index()
    table = HTMLTable(df)
    url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
    table.add_href('COSMIC_ID', url=url, newtab=True)
    self.jinja['cosmic_table'] = table.to_html()

    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'
    filename = os.sep.join([input_dir, 'ANOVA_input.csv'])
    self.report.gdsc.ic50.to_csv(filename)
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version or
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    self.jinja['gf_file'] = (
        'Saved <a href="INPUT/genomic_features.csv">Genomic Features</a> '
        'file<br/> (possibly the default version).')

    # Always save DRUG_DECODE file even if empty
    # It may be interpreted in other pipelines or used for reproducibility
    output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
    self.report.drug_decode.to_csv(output_filename)
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    filename = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(filename)
    filename = os.path.basename(filename)
    self.jinja['settings'] = \
        """Get the settings as a <a href="INPUT/%s"> json file</a>.""" % filename

    # Save all Results dataframe
    filename = os.sep.join(
        [self.settings.directory, 'OUTPUT', 'results.csv'])
    ANOVAResults(self.report.df).to_csv(filename)

    # Script that can be used to reproduce the analysis from the INPUT files
    code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv'
    }

    filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # the context manager closes the file even if the write fails
    with open(filename, 'w') as fh:
        fh.write(code)
def add_settings(self):
    """Save the input files, settings and results; fill the related links.

    Files written: INPUT/ANOVA_input.csv, INPUT/genomic_features.csv,
    INPUT/DRUG_DECODE.csv and INPUT/settings.json (under ``self.directory``),
    then OUTPUT/results.csv and code/rerun.py (under
    ``self.settings.directory``). The jinja keys ic50_file, gf_file,
    drug_decode and settings are set accordingly.
    """
    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'
    filename = os.sep.join([input_dir, 'ANOVA_input.csv'])
    self.report.gdsc.ic50.to_csv(filename)
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version or
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    self.jinja['gf_file'] = (
        'Saved <a href="INPUT/genomic_features.csv">Genomic Features</a> '
        'file<br/> (possibly the default version).')

    # Always save DRUG_DECODE file even if empty
    # It may be interpreted in other pipelines or used for reproducibility
    output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
    self.report.drug_decode.to_csv(output_filename)
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    filename = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(filename)
    filename = os.path.basename(filename)
    self.jinja['settings'] = \
        """Get the settings as a <a href="INPUT/%s"> json file</a>.""" % filename

    # Save all Results dataframe
    filename = os.sep.join(
        [self.settings.directory, 'OUTPUT', 'results.csv'])
    ANOVAResults(self.report.df).to_csv(filename)

    # Script that can be used to reproduce the analysis from the INPUT files
    code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv'
    }

    filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # the context manager closes the file even if the write fails
    with open(filename, 'w') as fh:
        fh.write(code)
def anova_all(self, animate=True, drugs=None):
    """Run all ANOVA tests for all drugs and all features.

    :param drugs: you may select a subset of drugs
    :param animate: shows the progress bar
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute called **df**

    Loops over all drugs calling :meth:`anova_one_drug` for each
    drug and concatenating all results together. Results of each drug are
    cached in :attr:`individual_anova`, so a drug already analysed is not
    computed again.

    Once all data are gathered, they are sorted by ANOVA p-value and, if
    the ``pvalue_correction_level`` setting is 'global', the FDR
    corrections are added using :meth:`add_pvalues_correction`. A column
    named "ASSOC_ID" is then inserted with a unique identifier following
    that order.

    .. note:: A thorough comparison with version v17 gives the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.
    """
    # drop DRUG where the number of non-null IC50 is below the
    # minimum_nonna_ic50 setting. axis=0 is the default but we emphasize
    # that the sum is over the rows (i.e. one count per drug)
    vv = self.ic50.df.notnull().sum(axis=0)
    # FIXME: should be in one_drug_one_feature ??
    drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # todo: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)
    pylab.shuffle(drug_names)
    if animate is True:
        pb.animate(0)

    for i, drug_name in enumerate(drug_names):
        # results already in the cache are not computed again
        if drug_name not in self.individual_anova:
            self.individual_anova[drug_name] = self.anova_one_drug(
                drug_name, animate=False, output='dataframe')
        if animate is True:
            pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        # NOTE(review): this returns the empty dataframe itself, not an
        # ANOVAResults instance; kept as is for backward compatibility.
        return df

    # sort all data by ANOVA p-values. Only a missing sort_values (older
    # pandas) triggers the fallback, so genuine errors are not masked.
    try:
        df.sort_values('ANOVA_FEATURE_pval', inplace=True)
    except AttributeError:
        df.sort('ANOVA_FEATURE_pval', inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == 'global':
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

    self.df = df
    # order the column names as stored in self.column_names. A new object
    # is built rather than resetting the index in place on a slice.
    df = df[self.column_names].reset_index(drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def anova_one_drug(self, drug_id, animate=True, output='object'):
    """Computes ANOVA for a given drug across all features

    :param str drug_id: a valid drug identifier (a column of the IC50
        dataframe).
    :param animate: shows the progress bar
    :param output: if 'object', an ANOVAResults instance is returned;
        any other value returns the dataframe itself.
    :return: the results for that drug. If no result could be computed,
        the empty dataframe is returned whatever the *output* value.

    Calls :meth:`anova_one_drug_one_feature` for each feature.
    """
    features = self.features.df.copy()

    # The first columns are factors, not features: skip them
    first_feature = self.features.shift
    features = features[features.columns[first_feature:]]

    # Keep only the features whose column sum is at least 3
    # FIXME what about features with less than 3 zeros ?
    # TODO: MSI, tissues, name must always be kept
    enough = features.sum(axis=0) >= 3
    selected_features = features[features.columns[enough]]

    # scan all features for a given drug
    assert drug_id in self.ic50.df.columns
    progress = Progress(len(selected_features.columns), 10)

    records = {}
    for count, feature in enumerate(selected_features.columns, start=1):
        # production=True: we ask for dictionaries instead of creating
        # a DataFrame for each call
        outcome = self.anova_one_drug_one_feature(drug_id, feature,
                                                  production=True)
        if outcome['ANOVA_FEATURE_pval'] is not None:
            records[feature] = outcome
        if animate is True:
            progress.animate(count)

    # One row per feature
    df = pd.DataFrame.from_records(records)
    df = df.T
    df = ANOVAResults().astype(df)

    if len(df) == 0:
        return df

    # append DRUG_NAME/DRUG_TARGET columns
    df = self.drug_decode.drug_annotations(df)

    # TODO: drop rows where ANOVA_FEATURE_PVAL is None
    df = self.add_pvalues_correction(df)
    if output != 'object':
        return df

    res = ANOVAResults(df, self.settings)
    res.settings = ANOVASettings(**self.settings)
    return res
def create_data_packages_for_companies(self, companies=None):
    """Create one data package per company and per TCGA case.

    :param companies: one company name (string), a list of names, or None
        to use ``self.companies``.

    For each company and each genomic feature file, the previously
    computed results are restricted to the drugs that are either public
    (WEBRELEASE=='Y') or owned by the company, and the HTML report is
    created in <company>/<tcga>. The drug pages and volcano images are
    copied from the ALL directory with the ``cp`` shell command.
    """
    from easydev import shellcmd, Progress

    # NOTE: DRUG_DECODE and IC50 inputs must be filtered to keep
    # only WEBRELEASE=Y and owner
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii + 1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Use the results held in memory if possible, otherwise read
            # them from the ALL directory. Only regular errors trigger the
            # fallback (a bare except would also catch KeyboardInterrupt).
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                results_df = ANOVAResults(results_path)

            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company and transform it into a
            # proper DrugDecode class for safety
            drug_decode_company = DrugDecode(self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company))
            company_drugs = drug_decode_company.df.index

            # filter results using the new drug decode
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in company_drugs for x in drug_ids_in_results]
            # .loc replaces the .ix indexer, which was removed from pandas
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

            # (DataFrame.select was removed from pandas)
            keep = [get_drug_id(drug) in company_drugs
                    for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)

            # NOTE(review): everything below is assumed to be skipped in
            # debug mode, including the copy of the drug files -- confirm
            # the intended nesting of the copy step.
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

                # For now, we just copy all DRUG images from
                # the analysis made in ALL
                print("\nCopying drug files")
                drug_ids = results.df.DRUG_ID.unique()
                pb = Progress(len(drug_ids))
                for i, drug_id in enumerate(drug_ids):
                    # copy the HTML
                    filename = "%s.html" % drug_id
                    source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                    dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                    cmd = "cp %s%s %s" % (source, filename, dest)
                    shellcmd(cmd, verbose=False)

                    # copy the images
                    filename = "volcano_%s.*" % drug_id
                    source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                    os.sep)
                    dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                 os.sep, os.sep)
                    cmd = "cp %s%s %s" % (source, filename, dest)
                    shellcmd(cmd, verbose=False)
                    pb.animate(i + 1)
def create_summary_pages(self, main_directory='ALL'):
    """Create the index.html page summarising all analyses.

    :param main_directory: directory that contains one sub-directory per
        analysis. It is used to find the analyses, to store the resources
        (css, js, images) and to write index.html.
    :return: the summary dataframe (one row per analysis).

    An analysis whose OUTPUT/drugs_summary.csv cannot be read gets a row
    filled with None.
    """
    from gdsctools import gdsctools_data
    from gdsctools.report import HTMLTable

    # create directories and copy relevant files (only if missing)
    for subdir in ('images', 'css', 'js'):
        self.mkdir(main_directory + os.sep + subdir)

    # (sub-directory, filenames, prefix of the resource name)
    resources = (
        ('css', ('gdsc.css', 'github-gist.css'), ''),
        ('js', ('highlight.pack.js',), ''),
        ('images', ('EBI_logo.png', 'sanger-logo.png'), 'images' + os.sep),
    )
    for subdir, filenames, prefix in resources:
        for filename in filenames:
            target = os.sep.join([main_directory, subdir, filename])
            if not os.path.isfile(target):
                shutil.copy(gdsctools_data(prefix + filename), target)

    # Scan the directories found in main_directory
    directories = [x for x in glob.glob(main_directory + os.sep + '*')
                   if os.path.isdir(x)]

    summary = []
    for directory in sorted(directories):
        tcga = os.path.basename(directory)
        # skip the resource directories created above
        if tcga in ('css', 'images', 'js'):
            continue

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()

        drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # .loc replaces the .ix indexer, which was removed from pandas
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([
            tcga, total_hits, drug_inv_prop, info['N_prop'],
            drug_inv_public, info['N_public']
        ])

    # Passing the columns to the constructor also works when no analysis
    # was found (empty summary)
    columns = [
        'Analysis name', 'Number of hits',
        'Number of involved proprietary compounds', 'out of',
        'Number of involved public', 'out of'
    ]
    df = pd.DataFrame(summary, columns=columns)

    # FIXME include css and images of logo
    # FIXME save in the proper directory
    self.html_page = ReportMAIN(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html')

    # Let us use our HTMLTable to add the HTML references
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_page.jinja['data_table'] = self.html_table.to_html()
    self.html_page.write()
    return df
def __init__(self, ic50, genomic_features=None, drug_decode=None,
             verbose=True, low_memory=True, set_media_factor=False):
    """.. rubric:: Constructor

    :param ic50: the IC50 input passed to :class:`readers.IC50`. Rows
        should be the COSMIC identifiers and columns should be the Drug
        names (or identifiers)
    :param genomic_features: the genomic features passed to
        :class:`readers.GenomicFeatures` (rows as in the IC50 matrix and
        columns as features). The first 3 columns must be named
        specifically to hold tissues, MSI (see format). If not provided,
        the default data set is used.
    :param drug_decode: a 3 column CSV file with drug's name and targets
        see :mod:`readers` for more information.
    :param verbose: verbosity
    :param low_memory: stored in the ``low_memory`` setting.
    :param set_media_factor: if True and no media factor is found in the
        genomic features, it is populated with ``fill_media_factor``.

    The attribute :attr:`settings` contains specific settings related
    to the analysis or visualisation.
    """
    self.verbose = verbose
    self._init_called = False

    # We first need to read the IC50 using a dedicated reader
    self.ic50 = readers.IC50(ic50)

    # Create a dictionary version of the data
    # to be accessed per drug where NA have already been
    # removed. Each drug is a dictionary with 2 keys:
    # Y for the data and indices for the cosmicID where
    # there is an IC50 measured.
    ic50_parse = self.ic50.df.unstack().dropna()
    self.ic50_dict = {}
    for drug in self.ic50.drugIds:
        # .loc replaces the .ix indexer, which was removed from pandas
        measured = ic50_parse.loc[drug]
        self.ic50_dict[drug] = {'indices': measured.index,
                                'Y': measured.values}

    # Reads features if provided, otherwise use a default data set
    if genomic_features is None:
        # Reads default version provided with the package
        self.features = readers.GenomicFeatures()
    else:
        self.features = readers.GenomicFeatures(genomic_features)

    if self.features.found_media is False and \
            set_media_factor is True:
        if self.verbose:
            print('Populating MEDIA Factor in the Genomic Feature matrix')
        self.features.fill_media_factor()

    #: a CSV with 3 columns used in the report
    self.read_drug_decode(drug_decode)

    # create the multiple testing factory used in anova_all()
    self.multiple_testing = MultipleTesting()

    # We prune the genomic features by setting the cosmic ids of
    # the features to be those of the cosmic ids of the IC50. See
    # readers module. This assignment prunes the features dataframe
    # automatically. This fails if a cosmic identifier is not
    # found in the features' cosmic ids, so let us catch the problem
    # beforehand to give a warning and drop the unknown identifiers.
    unknowns = set(self.ic50.cosmicIds).difference(
        set(self.features.cosmicIds))

    if len(unknowns) > 0:
        print("WARNING: " +
              "%s cosmic identifiers in your IC50 " % len(unknowns) +
              "could not be found in the genomic feature matrix. " +
              "They will be dropped. Consider using a user-defined " +
              "genomic features matrix")
        self.ic50.drop_cosmic(list(unknowns))

    self.features.cosmicIds = self.ic50.cosmicIds

    #: an instance of :class:`~gdsctools.settings.ANOVASettings`
    self.settings = ANOVASettings()
    self.settings.low_memory = low_memory

    # alias to all column names to store results
    # cast to list (Python3).
    self.column_names = list(ANOVAResults().mapping.keys())

    # skip assoc_id for now
    self._odof_dict = {name: None for name in self.column_names}

    # a cache to store ANOVA results for each drug
    self.individual_anova = {}

    # must be called if ic50 or features are changed.
    self.init()
def _create_summary_pages(self, main_directory, verbose=True, company=None):
    """Create the index.html page summarising the analyses of a directory.

    :param main_directory: directory that contains one sub-directory per
        analysis; index.html is written there. Only the sub-directories
        whose name is in ``self.tcga`` are considered.
    :param verbose: if True, print each directory being processed.
    :param company: if set, stored in the jinja context as "collaborator".
    :return: the summary dataframe (one row per analysis) sorted by
        decreasing number of hits.

    An analysis whose OUTPUT/drugs_summary.csv cannot be read gets a row
    filled with None.
    """
    # Read all directories found in main_directory
    directories = glob.glob(main_directory + os.sep + '*')

    summary = []
    for directory in sorted(directories):
        tcga = directory.split(os.sep)[-1]
        if tcga not in self.tcga:
            continue
        if verbose:
            print(directory, tcga)

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()

        drug_involved = hits['Unnamed: 0'].unique()

        # where to find the DRUG DECODE file. Should have been copied
        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # .loc replaces the .ix indexer, which was removed from pandas
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([
            tcga, total_hits, drug_inv_prop, info['N_prop'],
            drug_inv_public, info['N_public']
        ])

    # Passing the columns to the constructor also works when no analysis
    # was found (empty summary)
    columns = [
        'Analysis name', 'Number of hits',
        'Number of involved proprietary compounds', 'out of',
        'Number of involved public', 'out of'
    ]
    df = pd.DataFrame(summary, columns=columns)

    # Only a missing sort_values (older pandas) triggers the fallback
    try:
        df.sort_values(by="Number of hits", ascending=False, inplace=True)
    except AttributeError:
        df.sort("Number of hits", ascending=False, inplace=True)

    self.html_page = ReportMain(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html',
                                mode="summary")

    # Let us use our HTMLTable to add the HTML references
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_table.add_bgcolor('Number of hits')

    self.html_page.jinja['data_table'] = self.html_table.to_html(
        collapse_table=False)
    if company:
        self.html_page.jinja["collaborator"] = company
    self.html_page.write()

    return df