def test_anova_summary():
    """Check the ANOVA p-values of drug 999 / ASH1L_mut under three settings."""
    drug_id = 999
    feature = "ASH1L_mut"
    tolerance = 1e-10
    an = ANOVA(ic50_test)

    # Default regression: tissue + msi + feature.
    df = an.anova_one_drug_one_feature(drug_id, feature)
    pvalues = an.anova_pvalues
    observed = [pvalues[key] for key in ("tissue", "msi", "feature")]
    expected = [3.210453608523738e-06, 0.14579091345305398, 0.5430736275249095]
    assert_list_almost_equal(observed, expected, deltas=tolerance)

    # Any analysis type other than PANCAN.
    an.settings.analysis_type = "COREAD"
    df = an.anova_one_drug_one_feature(drug_id, feature)
    pvalues = an.anova_pvalues
    observed = [pvalues[key] for key in ("msi", "feature")]
    expected = [0.262294448831941, 0.30599483315087317]
    assert_list_almost_equal(observed, expected, deltas=tolerance)

    # Drop the MSI factor as well; in that case the tissue factor must
    # also be removed.
    an.settings.include_MSI_factor = False
    an.settings.analysis_type = "COREAD"
    df = an.anova_one_drug_one_feature(drug_id, feature)
    observed = [an.anova_pvalues["feature"]]
    expected = [0.21266050833611852]
    assert_list_almost_equal(observed, expected, deltas=tolerance)

    assert (df.N_FEATURE_neg == 365).all()
def test_anova_summary():
    """Check ``anova_pvalues`` for Drug_999_IC50 / ASH1L_mut under three settings."""
    drug_id = 'Drug_999_IC50'
    feature = 'ASH1L_mut'
    an = ANOVA(ic50_test)

    # Default regression: tissue + msi + feature.
    df = an.anova_one_drug_one_feature(drug_id, feature)
    expected = [3.210453608523738e-06, 0.14579091345305398,
                0.5430736275249095, None]
    assert_list_almost_equal(an.anova_pvalues, expected)

    # Any analysis type other than PANCAN.
    an.settings.analysis_type = 'COREAD'
    df = an.anova_one_drug_one_feature(drug_id, feature)
    expected = [0.262294448831941, 0.30599483315087317, None]
    assert_list_almost_equal(an.anova_pvalues, expected)

    # Finally drop the MSI factor as well.
    an.settings.include_MSI_factor = False
    df = an.anova_one_drug_one_feature(drug_id, feature)
    expected = [0.21266050833611852, None]
    assert_list_almost_equal(an.anova_pvalues, expected)

    assert (df.N_FEATURE_neg == 365).all()
def _analyse_all(self, multicore=None):
    """Run the ANOVA and build the HTML report for each genomic-features file.

    For every file in ``self.gf_filenames`` (processed in sorted order) the
    TCGA label is taken from the file name (second ``_``-separated token,
    up to the first dot). The results are stored in ``self.results[tcga]``
    and the last ANOVA / report instances are kept in ``self.an`` and
    ``self.report``.

    :param multicore: forwarded unchanged to ``ANOVA.anova_all``.
    """
    for gf_filename in sorted(self.gf_filenames):
        tcga = gf_filename.split("_")[1].split('.')[0]
        print(purple('======================== Analysing %s data' % tcga))

        output_directory = self.main_directory + os.sep + tcga
        self.mkdir(output_directory)

        # Computes the ANOVA. If the plain IC50 reader rejects the file,
        # fall back on the clustered reader. Only ordinary errors trigger
        # the fallback, so Ctrl-C / SystemExit are no longer swallowed.
        try:
            self.ic50 = IC50(self.ic50_filename)
        except Exception:
            print("Clustering IC50 (v18 released data ?)")
            self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

        an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                   verbose=False)

        if self.test is True:
            # Test mode: keep only the first 15 columns of the features.
            an.features.df = an.features.df[an.features.df.columns[0:15]]

        self.an = an
        an.settings = ANOVASettings(**self.settings)
        an.settings.analysis_type = tcga
        an.init()  # This resets the directory

        results = an.anova_all(multicore=multicore)
        # init() reset the directory, so set it again before reporting.
        an.settings.directory = output_directory

        # Store the results
        self.results[tcga] = results

        print('Analysing %s data and creating images' % tcga)
        self.report = ANOVAReport(an)
        self.report.settings.savefig = True
        self.report.create_html_pages(onweb=False)
def test_anova_one_drug_one_feature():
    """Compare the one-drug/one-feature ANOVA output with reference values."""
    an = ANOVA(ic50_test)

    # test 1 drug
    drug_id = 'Drug_999_IC50'
    df = an.anova_one_drug_one_feature(
        drug_id=drug_id, feature_name='ABCB1_mut', show=True)

    # Expected single-row result; every column holds one value at index 1.
    expected = {
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_id,
        'DRUG_TARGET': drug_id,
        'FEATURE': 'ABCB1_mut',
        'ANOVA_FEATURE_pval': 0.86842684367357359,
        'FEATURE_IC50_T_pval': 0.48586107208790896,
        'FEATURE_IC50_effect_size': 0.31407773405409201,
        'FEATURE_delta_MEAN_IC50': -0.36105662590553411,
        'FEATUREneg_Glass_delta': 0.31296252976074801,
        'FEATUREneg_IC50_sd': 1.1536736560173895,
        'FEATUREneg_logIC50_MEAN': 2.8007757068403043,
        'FEATUREpos_Glass_delta': 0.53754504818376181,
        'FEATUREpos_IC50_sd': 0.67167696386648634,
        'FEATUREpos_logIC50_MEAN': 2.4397190809347702,
        'ANOVA_MSI_pval': 0.14598946672374763,
        'N_FEATURE_neg': 370,
        'N_FEATURE_pos': 5,
        'ANOVA_TISSUE_pval': 3.2808255732569986e-06,
    }
    control = pd.DataFrame(
        {column: {1: value} for column, value in expected.items()})

    assert_list_almost_equal(df, control)
def test_multicore():
    """Single-process and 2-core runs of ``anova_all`` must agree.

    The comparison used to be evaluated but never asserted, and
    ``all(DataFrame)`` only iterates over the column labels, so the test
    could not fail. The element-wise comparison is now reduced over both
    axes and asserted.
    """
    an = ANOVA(ic50_test)
    results = an.anova_all()

    an2 = ANOVA(ic50_test)
    results2 = an2.anova_all(multicore=2)

    # NaN never equals NaN, hence the fillna(0) on both sides.
    same = results.df.fillna(0) == results2.df.fillna(0)
    assert same.all().all()
def test_anova_all():
    """Regression test on the sum of the FDR column of a reduced analysis."""
    # The full analysis is slow, so only the first 12 feature columns are
    # kept. This is a trick; it would be nice to have it in the API.
    all_features = ANOVA(ic50_test).features.df
    features = all_features[all_features.columns[0:12]]

    an = ANOVA(ic50_test, features)
    results = an.anova_all()
    fdr_sum = results.df['ANOVA_FEATURE_FDR'].sum()
    assert_almost_equal(fdr_sum, 10312.23061065521, 6)

    # Same analysis with the media factor switched on.
    an2 = ANOVA(ic50_test, features, set_media_factor=True)
    results = an2.anova_all()
    fdr_sum = results.df['ANOVA_FEATURE_FDR'].sum()
    assert_almost_equal(fdr_sum, 10238.529313503008, 6)
def test_anova_all():
    """Run a reduced ``anova_all`` and check the sum of the FDR column.

    Also exercises ``__repr__`` and the plotting helpers of the results.
    """
    an = ANOVA(ic50_test)
    # slow, let us cut the features to keep only ten-ish values
    # this is a trick but would be nice to have this in the API
    features = an.features.df
    features = features[features.columns[0:12]]

    an = ANOVA(ic50_test, features)
    results = an.anova_all()

    # A bare ``results`` statement never calls __repr__ outside of an
    # interactive session, so call repr() explicitly to test it.
    repr(results)
    results.volcano()
    results.barplot_effect_size()
    assert_almost_equal(results.df["ANOVA_FEATURE_FDR"].sum(),
                        10312.23061065521, delta=1e-6)

    an2 = ANOVA(ic50_test, features, set_media_factor=True)
    results = an2.anova_all()
    assert_almost_equal(results.df["ANOVA_FEATURE_FDR"].sum(),
                        10238.529313503008, delta=1e-6)
def multicore_anova(ic50, genomic_features, drug_decode=None, maxcpu=2,
                    sampling=0):
    """Run the per-drug ANOVA analysis on several cores.

    One job per drug is submitted to a :class:`MultiProcessing` pool and the
    individual results are stored in ``master.individual_anova``.

    Historical timing: the entire analysis took 15 minutes using 4 CPUs
    (16 Oct 2015).

    :param ic50: a filename or :class:`IC50` instance.
    :param genomic_features: forwarded to :class:`ANOVA`.
    :param drug_decode: forwarded to :class:`ANOVA`.
    :param int maxcpu: forwarded to :class:`MultiProcessing`.
    :param sampling: stored as ``master.sampling``.
    :return: the anova instance itself (not the results); see example below.

    ::

        master = multicore_anova(ic50, genomic_features, maxcpu=2)
        results = master.anova_all()

        from gdsctools import ANOVAReport
        report = ANOVAReport(master, results)
        report.create_html_pages()

    .. warning:: experimental. Seems to work but sometimes hangs forever.
    """
    print("experimental code to run the analysis with several cores")
    print("May takes lots or resources and slow down your system")
    t1 = time.time()

    master = ANOVA(ic50, genomic_features=genomic_features,
                   drug_decode=drug_decode, low_memory=True)
    master.sampling = sampling

    t = MultiProcessing(maxcpu=maxcpu)
    # add all jobs (one per drug)
    for drug in master.ic50.drugIds:
        t.add_job(analyse_one_drug, master, drug)
    t.run()

    # populate the ANOVA instance with the results; each entry holds the
    # drug identifier first and its result second
    for this in t.results:
        drug, result = this[0], this[1]
        master.individual_anova[drug] = result

    print("\nTook " + str(time.time() - t1) + "seconds.")
    return master
def test_html():
    """Build the full HTML report on a reduced feature set, then clean up."""
    import shutil

    # same as above, could factorise
    an = ANOVA(ic50_test)
    features = an.features.df
    features = features[features.columns[0:30]]

    an = ANOVA(ic50_test, features)
    # an.settings.include_media_factor = False
    results = an.anova_all()
    assert len(results.df) == 302

    r = ANOVAReport(gdsc=an, results=results)
    assert len(r.get_significant_set()) == 3

    # long but should cover everything.
    try:
        r.create_html_pages()
    finally:
        # Always remove the generated directory. Errors are ignored so that
        # a missing directory cannot mask the exception raised by
        # create_html_pages().
        shutil.rmtree('html_gdsc_anova', ignore_errors=True)
def test_anova_one_drug_one_feature():
    """Compare selected columns of the one-drug/one-feature ANOVA output."""
    an = ANOVA(ic50_test)

    # test 1 drug
    drug_id = 999
    df = an.anova_one_drug_one_feature(
        drug_id=drug_id, feature_name="ABCB1_mut", show=True)

    # Overwrite the descriptive columns and the media p-value so that they
    # match the control values below.
    for column in ("DRUG_ID", "DRUG_NAME", "DRUG_TARGET"):
        df[column] = drug_id
    df["ANOVA_MEDIA_pval"] = -1

    # The Glass delta and IC50 standard deviation columns are deliberately
    # left out of the control.
    control = pd.DataFrame({
        "DRUG_ID": {1: drug_id},
        "DRUG_NAME": {1: drug_id},
        "DRUG_TARGET": {1: drug_id},
        "FEATURE": {1: "ABCB1_mut"},
        "ANOVA_FEATURE_pval": {1: 0.86842684367357359},
        "FEATURE_IC50_T_pval": {1: 0.48586107208790896},
        "FEATURE_IC50_effect_size": {1: 0.31407773405409201},
        "FEATURE_delta_MEAN_IC50": {1: -0.36105662590553411},
        "FEATUREneg_logIC50_MEAN": {1: 2.8007757068403043},
        "FEATUREpos_logIC50_MEAN": {1: 2.4397190809347702},
        "ANOVA_MSI_pval": {1: 0.14598946672374763},
        "N_FEATURE_neg": {1: 370},
        "N_FEATURE_pos": {1: 5},
        "ANOVA_MEDIA_pval": {1: -1},
        "ANOVA_TISSUE_pval": {1: 3.2808255732569986e-06},
    })

    # Columns present in the control but not compared.
    not_compared = ["FEATURE_delta_MEAN_IC50", "FEATUREneg_logIC50_MEAN",
                    "FEATUREpos_logIC50_MEAN"]
    for this in control.columns:
        if this not in not_compared:
            assert_almost_equal(df[this].values[0], control[this].values[0])
def _analyse_all(self):
    """Run the ANOVA and create the HTML report for each genomic-features file.

    Results are stored in ``self.results`` under the TCGA label extracted
    from each file name.
    """
    for gf_filename in sorted(self.gf_filenames):
        # TCGA label: second "_"-separated token, up to the first dot.
        second_token = gf_filename.split("_")[1]
        tcga = second_token.split('.')[0]
        print('================================ Analysing %s data' % tcga)

        self.mkdir('ALL' + os.sep + tcga)

        # Computes the ANOVA
        an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
        self.an = an
        an.settings = ANOVASettings(**self.settings)
        # init() resets the analysis_type automatically
        an.init()

        # Store the results
        self.results[tcga] = an.anova_all()

        print('Analysing %s data and creating images' % tcga)
        report = ANOVAReport(an, self.results[tcga])
        self.report = report
        report.settings.savefig = True
        report.settings.directory = 'ALL/' + tcga
        report.settings.analysis_type = tcga
        report.create_html_pages()
def test_odof_with_without_media():
    """Check the ANOVA summary p-values without and with the media factor."""
    drug = 'Drug_1047_IC50'
    feature = 'TP53_mut'

    # Without the media factor.
    gdsc = ANOVA(ic50_test)
    _res = gdsc.anova_one_drug_one_feature(drug, feature)
    dd1 = gdsc._get_anova_summary(gdsc.data_lm, output='dict')
    observed = [dd1[key] for key in ('feature', 'msi', 'tissue')]
    expected = [1.5750735472022118e-58, 0.025902887791637515,
                5.541879283763767e-44]
    assert_list_almost_equal(observed, expected)

    # With the media factor.
    gdsc = ANOVA(ic50_test, set_media_factor=True)
    _res = gdsc.anova_one_drug_one_feature(drug, feature)
    dd2 = gdsc._get_anova_summary(gdsc.data_lm, output='dict')
    observed = [dd2[key] for key in ('feature', 'media', 'msi', 'tissue')]
    expected = [2.9236500715529455e-58, 0.7762487502315283,
                0.023777744527686766, 1.5729157319290974e-44]
    assert_list_almost_equal(observed, expected)
def test_odof_with_without_media():
    """Check the ANOVA summary p-values without and with the media factor."""
    drug_id = 1047
    feature = "TP53_mut"

    def summary(gdsc):
        # Run the analysis first, then summarise the fitted model.
        gdsc.anova_one_drug_one_feature(drug_id, feature)
        odof = gdsc._get_one_drug_one_feature_data(drug_id, feature)
        return gdsc._get_anova_summary(gdsc.data_lm, output="dict", odof=odof)

    # Without the media factor.
    dd1 = summary(ANOVA(ic50_test))
    assert_almost_equal(dd1["feature"], 1.5750735472022118e-58, delta=1e-64)
    assert_almost_equal(dd1["msi"], 0.025902887791637515, delta=1e-10)
    # Reference value changed in Nov 2016 (was 5.541879283763767e-44).
    assert_almost_equal(dd1["tissue"], 1.0258702741509e-44, delta=1e-50)

    # With the media factor.
    dd2 = summary(ANOVA(ic50_test, set_media_factor=True))
    assert_almost_equal(dd2["feature"], 2.9236500715529455e-58, delta=1e-64)
    assert_almost_equal(dd2["media"], 0.7762487502315283, delta=1e-10)
    assert_almost_equal(dd2["msi"], 0.023777744527686766, delta=1e-10)
    assert_almost_equal(dd2["tissue"], 1.5729157319290974e-44, delta=1e-50)
def test_compare_formula_vs_gdsc():
    """The explicit regression formula must reproduce the built-in analysis.

    Two configurations are compared: tissue + MSI + feature (the default)
    and MSI + feature only (analysis type set to a single tissue).
    """
    drug_id = 999
    feature = "ABCB1_mut"

    # TISSUE + MSI + feature
    an = ANOVA(ic50_test)
    df1 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name=feature)
    an.settings.regression_formula = "Y ~ C(tissue) + C(msi) + feature"
    df2 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name=feature)
    assert all(df1.fillna(1).values[0] == df2.fillna(1).values[0])

    # MSI + feature only
    an = ANOVA(ic50_test)
    an.settings.analysis_type = "breast"
    df1 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name=feature)
    an.settings.regression_formula = "Y ~ C(msi) + feature"
    # NOTE(review): this call used to select an.drugIds[2] and
    # an.feature_names[0]; it now uses the same drug/feature as df1 so the
    # comparison is like-for-like. Confirm this matches the intent.
    df2 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name=feature)
    # ``all(df1 == df2)`` iterates over column labels and is always true;
    # compare the row values instead, as in the first half of the test.
    assert all(df1.fillna(1).values[0] == df2.fillna(1).values[0])
def test_set_cancer_type():
    """Restricting to the breast cancer type must leave a known IC50 total."""
    an = ANOVA(gdsctools_data("IC50_v17.csv.gz"))
    an.set_cancer_type("breast")

    # Grand total of the IC50 table (column sums, then their sum).
    grand_total = an.ic50.df.sum().sum()
    assert_list_almost_equal([grand_total], [27721.255627472943])
def test_anova_one_drug_one_feature():
    """Compare the one-drug/one-feature ANOVA output with reference values."""
    an = ANOVA(ic50_test)

    # test 1 drug
    drug_id = 'Drug_999_IC50'
    df = an.anova_one_drug_one_feature(drug_id=drug_id,
                                       feature_name='ABCB1_mut',
                                       show=True)

    # (column, value) pairs of the single expected row, stored at index 1.
    reference = [
        ('DRUG_ID', drug_id),
        ('DRUG_NAME', drug_id),
        ('DRUG_TARGET', drug_id),
        ('FEATURE', 'ABCB1_mut'),
        ('ANOVA_FEATURE_pval', 0.86842684367357359),
        ('FEATURE_IC50_T_pval', 0.48586107208790896),
        ('FEATURE_IC50_effect_size', 0.31407773405409201),
        ('FEATURE_delta_MEAN_IC50', -0.36105662590553411),
        ('FEATUREneg_Glass_delta', 0.31296252976074801),
        ('FEATUREneg_IC50_sd', 1.1536736560173895),
        ('FEATUREneg_logIC50_MEAN', 2.8007757068403043),
        ('FEATUREpos_Glass_delta', 0.53754504818376181),
        ('FEATUREpos_IC50_sd', 0.67167696386648634),
        ('FEATUREpos_logIC50_MEAN', 2.4397190809347702),
        ('ANOVA_MSI_pval', 0.14598946672374763),
        ('N_FEATURE_neg', 370),
        ('N_FEATURE_pos', 5),
        ('ANOVA_TISSUE_pval', 3.2808255732569986e-06),
    ]
    control = pd.DataFrame(dict((name, {1: value})
                                for name, value in reference))

    assert_list_almost_equal(df, control)
def test_anova_one_drug():
    """Smoke test: analyse one drug across all features (no value checked)."""
    an = ANOVA(ic50_test)
    an.anova_one_drug('Drug_999_IC50')
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    For each company and each genomic-features file, the previously
    computed results are restricted to the public drugs (WEBRELEASE=='Y')
    plus the drugs owned by that company, and the HTML report is written
    under ``<company_directory>/<company>/<tcga>``.

    :param companies: one company name (a string) or a list of names.
        By default, all companies found in the DrugDecode (``self.companies``).
    :raises ValueError: if the list of companies is empty.
    """
    ##########################################################
    # DRUG_DECODE and IC50 inputs must be filtered to keep   #
    # only WEBRELEASE=Y and owner                            #
    ##########################################################

    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError("Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving information built
    # in analyse() method, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(purple("\n=========== Analysing company %s out of %s (%s)" %
                     (ii+1, Ncomp, company)))
        company_path = self.company_directory + os.sep + company
        self.mkdir(company_path)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown("  ------- building TCGA %s sub directory" % tcga))

            # Read the results previously computed, either from memory or,
            # failing that, from the CSV file saved by the main analysis.
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure this is integers
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = [x in allowed_drugs for x in results.df.DRUG_ID]
            # .loc replaces the removed .ix indexer (boolean row selection)
            results.df = results.df.loc[mask]

            # We read the IC50 again
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analyse
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns (drugs) visible to this company;
            # boolean .loc selection replaces the removed DataFrame.select
            keep = [drug in allowed_drugs for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company_path + os.sep + tcga
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def create_data_packages_for_companies(self, companies=None):
    """Create one data package (HTML report) per company and per TCGA type.

    For each company, the previously computed results are restricted to
    the public drugs (WEBRELEASE=='Y') plus the drugs owned by that
    company; the report is written in ``<company>/<tcga>`` and the drug
    pages/images are copied from the ``ALL/<tcga>`` analysis.

    :param companies: one company name (a string) or a list of names.
        By default, ``self.companies`` is used.
    """
    ##########################################################
    # DRUG_DECODE and IC50 inputs must be filtered to keep   #
    # only WEBRELEASE=Y and owner                            #
    ##########################################################
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii + 1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Read the results previously computed, either from memory or,
            # failing that, from the CSV file saved by the ALL analysis.
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                results_df = ANOVAResults(results_path)

            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed_drugs = drug_decode_company.df.index

            # filter results using the new drug decode
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in allowed_drugs for x in drug_ids_in_results]
            # .loc replaces the removed .ix indexer (boolean row selection)
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

            # boolean .loc selection replaces the removed DataFrame.select
            keep = [get_drug_id(drug) in allowed_drugs
                    for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)

            # NOTE(review): the original indentation was lost; the copy
            # step below is assumed to belong to the non-debug branch,
            # together with the two page builders -- confirm.
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

                # For now, we just copy all DRUG images from
                # the analysis made in ALL
                from easydev import shellcmd, Progress
                print("\nCopying drug files")
                drug_ids = results.df.DRUG_ID.unique()
                pb = Progress(len(drug_ids))

                # Source/destination directories do not depend on the drug.
                html_source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                html_dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                image_source = "ALL%s%s%simages%s" % (
                    os.sep, tcga, os.sep, os.sep)
                image_dest = "%s%s%s%simages%s" % (
                    company, os.sep, tcga, os.sep, os.sep)

                for i, drug_id in enumerate(drug_ids):
                    # copy the HTML
                    filename = "%s.html" % drug_id
                    cmd = "cp %s%s %s" % (html_source, filename, html_dest)
                    shellcmd(cmd, verbose=False)

                    # copy the images (the shell expands the wildcard)
                    filename = "volcano_%s.*" % drug_id
                    cmd = "cp %s%s %s" % (image_source, filename, image_dest)
                    shellcmd(cmd, verbose=False)
                    pb.animate(i + 1)
def create_data_packages_for_companies(self, companies=None):
    """Create one data package (HTML report) per company and per TCGA type.

    For each company, the previously computed results are restricted to
    the public drugs (WEBRELEASE=='Y') plus the drugs owned by that
    company; the report is written in ``<company>/<tcga>`` and the drug
    pages/images are copied from the ``ALL/<tcga>`` analysis.

    :param companies: one company name (a string) or a list of names.
        By default, ``self.companies`` is used.
    """
    ##########################################################
    # DRUG_DECODE and IC50 inputs must be filtered to keep   #
    # only WEBRELEASE=Y and owner                            #
    ##########################################################
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii+1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Read the results previously computed, either from memory or,
            # failing that, from the CSV file saved by the ALL analysis.
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                results_df = ANOVAResults(results_path)

            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            company_drugs = drug_decode_company.df.index

            # filter results using the new drug decode
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in company_drugs for x in drug_ids_in_results]
            # .loc replaces the removed .ix indexer (boolean row selection)
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

            # boolean .loc selection replaces the removed DataFrame.select
            keep = [get_drug_id(drug) in company_drugs
                    for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)

            # NOTE(review): the original indentation was lost; the copy
            # step below is assumed to belong to the non-debug branch,
            # together with the two page builders -- confirm.
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

                # For now, we just copy all DRUG images from
                # the analysis made in ALL
                from easydev import shellcmd, Progress
                print("\nCopying drug files")
                drug_ids = results.df.DRUG_ID.unique()
                pb = Progress(len(drug_ids))

                # Source/destination directories do not depend on the drug.
                html_source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                html_dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                image_source = "ALL%s%s%simages%s" % (
                    os.sep, tcga, os.sep, os.sep)
                image_dest = "%s%s%s%simages%s" % (
                    company, os.sep, tcga, os.sep, os.sep)

                for i, drug_id in enumerate(drug_ids):
                    # copy the HTML
                    filename = "%s.html" % drug_id
                    cmd = "cp %s%s %s" % (html_source, filename, html_dest)
                    shellcmd(cmd, verbose=False)

                    # copy the images (the shell expands the wildcard)
                    filename = "volcano_%s.*" % drug_id
                    cmd = "cp %s%s %s" % (image_source, filename, image_dest)
                    shellcmd(cmd, verbose=False)
                    pb.animate(i+1)
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    For each company and each genomic-features file, the previously
    computed results are restricted to the public drugs (WEBRELEASE=='Y')
    plus the drugs owned by that company, and the HTML report is written
    under ``<company_directory>/<company>/<tcga>``.

    :param companies: one company name (a string) or a list of names.
        By default, all companies found in the DrugDecode (``self.companies``).
    :raises ValueError: if the list of companies is empty.
    """
    ##########################################################
    # DRUG_DECODE and IC50 inputs must be filtered to keep   #
    # only WEBRELEASE=Y and owner                            #
    ##########################################################

    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError(
            "Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving information built
    # in analyse() method, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(
            purple("\n=========== Analysing company %s out of %s (%s)" %
                   (ii + 1, Ncomp, company)))
        package_root = self.company_directory + os.sep + company
        self.mkdir(package_root)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown("  ------- building TCGA %s sub directory" % tcga))

            # Read the results previously computed, either from memory or,
            # failing that, from the CSV file saved by the main analysis.
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            visible_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure this is integers
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = [x in visible_drugs for x in results.df.DRUG_ID]
            # .loc replaces the removed .ix indexer (boolean row selection)
            results.df = results.df.loc[mask]

            # We read the IC50 again
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analyse
            # again but to hold various information
            an = ANOVA(self.ic50,
                       gf_filename,
                       drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns (drugs) visible to this company;
            # boolean .loc selection replaces the removed DataFrame.select
            keep = [drug in visible_drugs for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = package_root + os.sep + tcga
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an,
                                      results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()