Python ANOVA Examples, gdsctools.anova.ANOVA Python Examples

Example #1

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_anova_summary():
    """Check the ANOVA p-values of one drug/feature pair under three settings.

    The same association (drug 999, feature ``ASH1L_mut``) is analysed with
    the default settings, then with ``analysis_type`` set to ``"COREAD"``,
    and finally with the MSI factor disabled as well.
    """
    anova = ANOVA(ic50_test)
    drug, feature = 999, "ASH1L_mut"
    tolerance = 1e-10

    # Default settings: tissue, msi and feature p-values are all checked.
    result = anova.anova_one_drug_one_feature(drug, feature)
    pvalues = anova.anova_pvalues
    observed = [pvalues[name] for name in ("tissue", "msi", "feature")]
    expected = [3.210453608523738e-06, 0.14579091345305398, 0.5430736275249095]
    assert_list_almost_equal(observed, expected, deltas=tolerance)

    # Something different from PANCAN: only msi and feature are checked.
    anova.settings.analysis_type = "COREAD"
    result = anova.anova_one_drug_one_feature(drug, feature)
    pvalues = anova.anova_pvalues
    observed = [pvalues[name] for name in ("msi", "feature")]
    expected = [0.262294448831941, 0.30599483315087317]
    assert_list_almost_equal(observed, expected, deltas=tolerance)

    # Remove the MSI factor too, in which case the tissue must also be
    # removed; only the feature p-value is left to check.
    anova.settings.include_MSI_factor = False
    anova.settings.analysis_type = "COREAD"
    result = anova.anova_one_drug_one_feature(drug, feature)
    observed = [anova.anova_pvalues["feature"]]
    expected = [0.21266050833611852]
    assert_list_almost_equal(observed, expected, deltas=tolerance)

    assert (result.N_FEATURE_neg == 365).all()

Example #2

0

Show file

File: test_anova.py Project: howard-lightfoot/gdsctools

def test_anova_summary():
    """Check ``anova_pvalues`` for one drug/feature pair under three settings.

    The association (``Drug_999_IC50``, ``ASH1L_mut``) is analysed with the
    default settings, with ``analysis_type`` set to ``'COREAD'``, and then
    with the MSI factor disabled as well.
    """
    anova = ANOVA(ic50_test)
    drug, feature = 'Drug_999_IC50', 'ASH1L_mut'

    # Default settings.
    result = anova.anova_one_drug_one_feature(drug, feature)
    observed = anova.anova_pvalues
    expected = [3.210453608523738e-06, 0.14579091345305398,
                0.5430736275249095, None]
    assert_list_almost_equal(observed, expected)

    # Something different from PANCAN.
    anova.settings.analysis_type = 'COREAD'
    result = anova.anova_one_drug_one_feature(drug, feature)
    observed = anova.anova_pvalues
    expected = [0.262294448831941, 0.30599483315087317, None]
    assert_list_almost_equal(observed, expected)

    # Now remove the MSI factor as well.
    anova.settings.include_MSI_factor = False
    result = anova.anova_one_drug_one_feature(drug, feature)
    observed = anova.anova_pvalues
    expected = [0.21266050833611852, None]
    assert_list_almost_equal(observed, expected)

    assert (result.N_FEATURE_neg == 365).all()

Example #3

0

Show file

File: gdsc.py Project: CancerRxGene/gdsctools

    def _analyse_all(self, multicore=None):
        """Run the ANOVA and build the HTML report for every genomic-feature file.

        For each file in ``self.gf_filenames`` (processed in sorted order) the
        TCGA label is taken from the text between the first ``_`` and the
        following ``.`` of the filename. A sub-directory of
        ``self.main_directory`` is created for it, the IC50 data are loaded,
        the ANOVA is run and stored in ``self.results[tcga]``, and an
        :class:`ANOVAReport` is generated.

        :param multicore: forwarded unchanged to ``ANOVA.anova_all``.
        """
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                # Fall back to the clustered reader. A bare ``except`` would
                # also swallow KeyboardInterrupt/SystemExit, so only regular
                # exceptions are caught here.
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                verbose=False)

            # In test mode, keep only the first 15 feature columns
            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init() # This reset the directory

            results = an.anova_all(multicore=multicore)
            # The directory is set after init(), which resets it (see above)
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)

Example #4

0

Show file

File: test_anova.py Project: howard-lightfoot/gdsctools

def test_anova_one_drug_one_feature():
    """Compare the one-drug/one-feature ANOVA output with reference values."""
    anova = ANOVA(ic50_test)

    # test 1 drug
    drug = 'Drug_999_IC50'
    observed = anova.anova_one_drug_one_feature(drug_id=drug,
                                                feature_name='ABCB1_mut',
                                                show=True)

    # Reference values for the single expected row (index label 1).
    reference = {
        'DRUG_ID': drug,
        'DRUG_NAME': drug,
        'DRUG_TARGET': drug,
        'FEATURE': 'ABCB1_mut',
        'ANOVA_FEATURE_pval': 0.86842684367357359,
        'FEATURE_IC50_T_pval': 0.48586107208790896,
        'FEATURE_IC50_effect_size': 0.31407773405409201,
        'FEATURE_delta_MEAN_IC50': -0.36105662590553411,
        'FEATUREneg_Glass_delta': 0.31296252976074801,
        'FEATUREneg_IC50_sd': 1.1536736560173895,
        'FEATUREneg_logIC50_MEAN': 2.8007757068403043,
        'FEATUREpos_Glass_delta': 0.53754504818376181,
        'FEATUREpos_IC50_sd': 0.67167696386648634,
        'FEATUREpos_logIC50_MEAN': 2.4397190809347702,
        'ANOVA_MSI_pval': 0.14598946672374763,
        'N_FEATURE_neg': 370,
        'N_FEATURE_pos': 5,
        'ANOVA_TISSUE_pval': 3.2808255732569986e-06,
    }
    expected = pd.DataFrame(
        {column: {1: value} for column, value in reference.items()})

    assert_list_almost_equal(observed, expected)

Example #5

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_multicore():
    """Check that the single-process and multicore analyses give equal results.

    ``anova_all`` is run once with default arguments and once with
    ``multicore=2``; the two result frames (NaN replaced by 0) must match
    element by element.
    """
    an = ANOVA(ic50_test)
    results = an.anova_all()

    an2 = ANOVA(ic50_test)
    results2 = an2.anova_all(multicore=2)

    # The built-in all() applied to a DataFrame iterates its column labels,
    # and the original comparison was never asserted, so the test could not
    # fail. Reduce the element-wise comparison explicitly and assert it.
    same = results.df.fillna(0) == results2.df.fillna(0)
    assert same.values.all()

Example #6

0

Show file

File: test_anova.py Project: howard-lightfoot/gdsctools

def test_anova_all():
    """Run the full ANOVA on a reduced feature set, with and without media.

    The sum of the ``ANOVA_FEATURE_FDR`` column is compared with a reference
    value for each configuration.
    """
    # The full analysis is slow, so only the first 12 feature columns are
    # kept. This is a trick; it would be nice to have this in the API.
    subset = ANOVA(ic50_test).features.df
    subset = subset[subset.columns[0:12]]

    # (extra ANOVA keyword arguments, expected sum of the FDR column)
    cases = (
        ({}, 10312.23061065521),
        ({'set_media_factor': True}, 10238.529313503008),
    )
    for extra_kwargs, expected_sum in cases:
        results = ANOVA(ic50_test, subset, **extra_kwargs).anova_all()
        assert_almost_equal(results.df['ANOVA_FEATURE_FDR'].sum(),
                            expected_sum, 6)

Example #7

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_anova_all():
    """Run the full ANOVA on a reduced feature set and check FDR sums.

    Also exercises the ``__repr__``, ``volcano`` and ``barplot_effect_size``
    methods of the results object, then repeats the analysis with
    ``set_media_factor=True``.
    """
    an = ANOVA(ic50_test)
    # slow, let us cut the features to keep only ten-ish values
    # this is a trick but would be nice to have this in the API
    features = an.features.df
    features = features[features.columns[0:12]]

    an = ANOVA(ic50_test, features)
    results = an.anova_all()
    # A bare ``results`` expression statement never invokes __repr__ outside
    # an interactive session, so call repr() explicitly to exercise it.
    repr(results)
    results.volcano()
    results.barplot_effect_size()

    assert_almost_equal(results.df["ANOVA_FEATURE_FDR"].sum(), 10312.23061065521, delta=1e-6)

    an2 = ANOVA(ic50_test, features, set_media_factor=True)
    results = an2.anova_all()
    assert_almost_equal(results.df["ANOVA_FEATURE_FDR"].sum(), 10238.529313503008, delta=1e-6)

Example #8

0

Show file

File: multicore.py Project: howard-lightfoot/gdsctools

def multicore_anova(ic50, genomic_features, drug_decode=None, maxcpu=2,
                    sampling=0):
    """Run the ANOVA analysis with one job per drug spread over several CPUs.

    Using 4 CPUs, the entire analysis took 15 minutes (16 Oct 2015).

    :param ic50: a filename or :class:`IC50` instance.
    :param genomic_features: forwarded to :class:`ANOVA` as its
        ``genomic_features`` argument.
    :param drug_decode: forwarded to :class:`ANOVA` as its ``drug_decode``
        argument.
    :param maxcpu: forwarded to :class:`MultiProcessing` as ``maxcpu``.
    :param sampling: stored on the ANOVA instance as its ``sampling``
        attribute.
    :return: the anova instance itself (not the results); see example below.

    ::

        master = multicore_anova(dataset, genomic_features, maxcpu=2)
        results = master.anova_all()

        from gdsctools import ANOVAReport
        report = ANOVAReport(master, results)
        report.create_html_pages()

    .. warning:: experimental. Seems to work but sometimes hangs forever.
    """
    print("experimental code to run the analysis with several cores")
    print("May takes lots or resources and slow down your system")
    t1 = time.time()
    master = ANOVA(ic50, genomic_features=genomic_features,
                   drug_decode=drug_decode, low_memory=True)
    master.sampling = sampling

    t = MultiProcessing(maxcpu=maxcpu)
    # add all jobs (one per drug)
    for drug in master.ic50.drugIds:
        t.add_job(analyse_one_drug, master, drug)
    t.run()

    # populate the ANOVA instance with the results; each entry of t.results
    # holds the drug identifier at index 0 and its result at index 1
    for this in t.results:
        master.individual_anova[this[0]] = this[1]

    print("\nTook " + str(time.time() - t1) + "seconds.")
    return master

Example #9

0

Show file

File: test_anova_html.py Project: howard-lightfoot/gdsctools

def test_html():
    """Build the HTML report from a reduced ANOVA analysis and clean up.

    Only the first 30 feature columns are analysed. The number of results and
    the size of the significant set are checked before the HTML pages are
    created; the output directory ``html_gdsc_anova`` is removed afterwards.
    """
    import shutil

    # same as above, could factorise
    an = ANOVA(ic50_test)
    features = an.features.df
    features = features[features.columns[0:30]]
    an = ANOVA(ic50_test, features)
    results = an.anova_all()

    assert len(results.df) == 302

    r = ANOVAReport(gdsc=an, results=results)
    assert len(r.get_significant_set()) == 3
    # long but should cover everything.
    try:
        r.create_html_pages()
    finally:
        # Always remove the generated directory. ignore_errors keeps a missing
        # directory from masking the original failure of create_html_pages().
        shutil.rmtree('html_gdsc_anova', ignore_errors=True)

Example #10

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_anova_one_drug_one_feature():
    """Compare the one-drug/one-feature ANOVA output with reference values.

    The drug identification columns and ``ANOVA_MEDIA_pval`` are overwritten
    with fixed values before the comparison, and three mean/delta columns are
    excluded from it.
    """
    anova = ANOVA(ic50_test)
    # test 1 drug
    drug = 999
    observed = anova.anova_one_drug_one_feature(drug_id=drug, feature_name="ABCB1_mut", show=True)
    for column in ("DRUG_ID", "DRUG_NAME", "DRUG_TARGET"):
        observed[column] = drug
    observed["ANOVA_MEDIA_pval"] = -1

    # Reference values for the single expected row (index label 1).
    # The FEATUREneg/FEATUREpos Glass_delta and IC50_sd columns are not part
    # of the reference.
    reference = {
        "DRUG_ID": drug,
        "DRUG_NAME": drug,
        "DRUG_TARGET": drug,
        "FEATURE": "ABCB1_mut",
        "ANOVA_FEATURE_pval": 0.86842684367357359,
        "FEATURE_IC50_T_pval": 0.48586107208790896,
        "FEATURE_IC50_effect_size": 0.31407773405409201,
        "FEATURE_delta_MEAN_IC50": -0.36105662590553411,
        "FEATUREneg_logIC50_MEAN": 2.8007757068403043,
        "FEATUREpos_logIC50_MEAN": 2.4397190809347702,
        "ANOVA_MSI_pval": 0.14598946672374763,
        "N_FEATURE_neg": 370,
        "N_FEATURE_pos": 5,
        "ANOVA_MEDIA_pval": -1,
        "ANOVA_TISSUE_pval": 3.2808255732569986e-06,
    }
    expected = pd.DataFrame({column: {1: value} for column, value in reference.items()})

    # Columns present in the reference but not compared.
    not_compared = ("FEATURE_delta_MEAN_IC50", "FEATUREneg_logIC50_MEAN", "FEATUREpos_logIC50_MEAN")
    for column in expected.columns:
        if column in not_compared:
            continue
        assert_almost_equal(observed[column].values[0], expected[column].values[0])

Example #11

0

Show file

File: gdsc.py Project: howard-lightfoot/gdsctools

    def _analyse_all(self):
        """Run the ANOVA and create the HTML report for each genomic-feature file.

        The files in ``self.gf_filenames`` are handled in sorted order. The
        TCGA label is the text between the first ``_`` and the following
        ``.`` of the filename; outputs go to the ``ALL/<tcga>`` directory and
        the results are kept in ``self.results[tcga]``.
        """
        for features_file in sorted(self.gf_filenames):
            tcga = features_file.split("_")[1].partition('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir(os.sep.join(['ALL', tcga]))

            # Computes the ANOVA
            anova = ANOVA(self.ic50_filename, features_file, self.drug_decode)
            self.an = anova
            anova.settings = ANOVASettings(**self.settings)
            # init() resets the analysis_type automatically
            anova.init()
            outcome = anova.anova_all()

            # Store the results
            self.results[tcga] = outcome

            print('Analysing %s data and creating images' % tcga)
            report = ANOVAReport(anova, self.results[tcga])
            self.report = report
            report.settings.savefig = True
            report.settings.directory = 'ALL/' + tcga
            report.settings.analysis_type = tcga
            report.create_html_pages()

Example #12

0

Show file

File: test_anova.py Project: howard-lightfoot/gdsctools

def test_odof_with_without_media():
    """Check the ANOVA summary p-values with and without the media factor."""
    drug, feature = 'Drug_1047_IC50', 'TP53_mut'

    # Without the media factor.
    gdsc = ANOVA(ic50_test)
    gdsc.anova_one_drug_one_feature(drug, feature)
    summary = gdsc._get_anova_summary(gdsc.data_lm, output='dict')
    observed = [summary[key] for key in ('feature', 'msi', 'tissue')]
    expected = [1.5750735472022118e-58, 0.025902887791637515,
                5.541879283763767e-44]
    assert_list_almost_equal(observed, expected)

    # With the media factor.
    gdsc = ANOVA(ic50_test, set_media_factor=True)
    gdsc.anova_one_drug_one_feature(drug, feature)
    summary = gdsc._get_anova_summary(gdsc.data_lm, output='dict')
    observed = [summary[key] for key in ('feature', 'media', 'msi', 'tissue')]
    expected = [2.9236500715529455e-58, 0.7762487502315283,
                0.023777744527686766, 1.5729157319290974e-44]
    assert_list_almost_equal(observed, expected)

Example #13

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_odof_with_without_media():
    """Check the ANOVA summary p-values with and without the media factor.

    Each p-value is compared with its reference using its own tolerance.
    """
    drug, feature = 1047, "TP53_mut"

    def summarise(anova):
        # Run the association, then build the summary dictionary from it.
        anova.anova_one_drug_one_feature(drug, feature)
        odof = anova._get_one_drug_one_feature_data(drug, feature)
        return anova._get_anova_summary(anova.data_lm, output="dict", odof=odof)

    # Without the media factor. The tissue reference changed in Nov 2016;
    # it was previously 5.541879283763767e-44.
    summary = summarise(ANOVA(ic50_test))
    references = (
        ("feature", 1.5750735472022118e-58, 1e-64),
        ("msi", 0.025902887791637515, 1e-10),
        ("tissue", 1.0258702741509e-44, 1e-50),
    )
    for key, expected, tolerance in references:
        assert_almost_equal(summary[key], expected, delta=tolerance)

    # With the media factor.
    summary = summarise(ANOVA(ic50_test, set_media_factor=True))
    references = (
        ("feature", 2.9236500715529455e-58, 1e-64),
        ("media", 0.7762487502315283, 1e-10),
        ("msi", 0.023777744527686766, 1e-10),
        ("tissue", 1.5729157319290974e-44, 1e-50),
    )
    for key, expected, tolerance in references:
        assert_almost_equal(summary[key], expected, delta=tolerance)

Example #14

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_compare_formula_vs_gdsc():
    """Compare the default analysis with one driven by an explicit formula.

    The same association is computed twice, once with the default settings
    and once after setting ``settings.regression_formula``, and the two
    outputs are compared after replacing NaN by 1.
    """
    # TISSUE + MSI + feature
    an = ANOVA(ic50_test)
    drug_id = 999
    df1 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name="ABCB1_mut")
    an.settings.regression_formula = "Y ~ C(tissue) + C(msi) + feature"
    df2 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name="ABCB1_mut")

    # Element-wise comparison of the first row of each output
    assert all(df1.fillna(1).values[0] == df2.fillna(1).values[0])

    # MSI + feature only
    an = ANOVA(ic50_test)
    an.settings.analysis_type = "breast"
    df1 = an.anova_one_drug_one_feature(drug_id=drug_id, feature_name="ABCB1_mut")
    an.settings.regression_formula = "Y ~ C(msi) + feature"
    # NOTE(review): df2 uses an.drugIds[2] / an.feature_names[0] rather than
    # drug_id / "ABCB1_mut" as df1 does -- confirm both refer to the same
    # association in the test data.
    df2 = an.anova_one_drug_one_feature(an.drugIds[2], an.feature_names[0])
    # NOTE(review): unlike the first check, this one does not go through
    # ``.values[0]``. If df1/df2 are DataFrames, the built-in all() iterates
    # the column labels of the comparison, not its values, so this assertion
    # is presumably always true -- TODO confirm and tighten.
    assert all(df1.fillna(1) == df2.fillna(1))

Example #15

0

Show file

File: test_anova.py Project: CancerRxGene/gdsctools

def test_set_cancer_type():
    """Check the total of the IC50 table after selecting the breast cancer type."""
    anova = ANOVA(gdsctools_data("IC50_v17.csv.gz"))
    anova.set_cancer_type("breast")

    # Sum over every column, then over the per-column sums.
    grand_total = anova.ic50.df.sum().sum()
    assert_list_almost_equal([grand_total], [27721.255627472943])

Example #16

0

Show file

def test_anova_one_drug_one_feature():
    """Compare the one-drug/one-feature ANOVA output with reference values."""
    anova = ANOVA(ic50_test)

    # test 1 drug
    drug = 'Drug_999_IC50'
    observed = anova.anova_one_drug_one_feature(
        drug_id=drug, feature_name='ABCB1_mut', show=True)

    # Reference values for the single expected row (index label 1).
    reference = {
        'DRUG_ID': drug,
        'DRUG_NAME': drug,
        'DRUG_TARGET': drug,
        'FEATURE': 'ABCB1_mut',
        'ANOVA_FEATURE_pval': 0.86842684367357359,
        'FEATURE_IC50_T_pval': 0.48586107208790896,
        'FEATURE_IC50_effect_size': 0.31407773405409201,
        'FEATURE_delta_MEAN_IC50': -0.36105662590553411,
        'FEATUREneg_Glass_delta': 0.31296252976074801,
        'FEATUREneg_IC50_sd': 1.1536736560173895,
        'FEATUREneg_logIC50_MEAN': 2.8007757068403043,
        'FEATUREpos_Glass_delta': 0.53754504818376181,
        'FEATUREpos_IC50_sd': 0.67167696386648634,
        'FEATUREpos_logIC50_MEAN': 2.4397190809347702,
        'ANOVA_MSI_pval': 0.14598946672374763,
        'N_FEATURE_neg': 370,
        'N_FEATURE_pos': 5,
        'ANOVA_TISSUE_pval': 3.2808255732569986e-06,
    }
    expected = pd.DataFrame(
        {column: {1: value} for column, value in reference.items()})

    assert_list_almost_equal(observed, expected)

Example #17

0

Show file

def test_anova_one_drug():
    """Smoke test: analyse one entire drug across all features."""
    analysis = ANOVA(ic50_test)
    # Only checks that the call completes; the returned value is not inspected.
    analysis.anova_one_drug('Drug_999_IC50')

Example #18

0

Show file

File: test_anova.py Project: howard-lightfoot/gdsctools

def test_anova_one_drug():
    """Smoke test: analyse one entire drug across all features."""
    analysis = ANOVA(ic50_test)
    # Only checks that the call completes; the returned value is not inspected.
    analysis.anova_one_drug('Drug_999_IC50')

Example #19

0

Show file

File: gdsc.py Project: CancerRxGene/gdsctools

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file

        For every company and every genomic-feature file in
        ``self.gf_filenames``, the previously computed results are restricted
        to the drugs kept for that company and an HTML report is written
        under ``self.company_directory/<company>/<tcga>``.

        :param companies: one company name (a string), a list of names, or
            None to use ``self.companies``.
        :raises ValueError: if the list of companies is empty.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError("Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or, failing that, from the results file on disk. Only
                # regular exceptions trigger the fallback (a bare except
                # would also swallow KeyboardInterrupt/SystemExit).
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # Make sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                company_drugs = drug_decode_company.df.index

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [x in company_drugs for x in results.df.DRUG_ID]

                # .loc replaces the .ix indexer, which was removed in pandas 1.0
                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except Exception:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                    verbose=False)

                # Keep only the IC50 columns of the drugs kept for this
                # company (boolean column mask instead of DataFrame.select,
                # which was removed in pandas 1.0)
                keep = [drug in company_drugs for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                        drug_decode=drug_decode_company,
                        verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

Example #20

0

Show file

File: gdsc.py Project: howard-lightfoot/gdsctools

    def create_data_packages_for_companies(self, companies=None):
        """Create a report directory for each company.

        For every company and every genomic-feature file in
        ``self.gf_filenames``, the previously computed results are restricted
        to the drugs kept for that company and an HTML report is written to
        ``<company>/<tcga>``. Unless ``self.debug`` is set, the per-drug HTML
        pages and volcano images are copied from the ``ALL`` directory.

        :param companies: one company name (a string), a list of names, or
            None to use ``self.companies``.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed, from memory or,
                # failing that, from the results file on disk. Only regular
                # exceptions trigger the fallback (a bare except would also
                # swallow KeyboardInterrupt/SystemExit).
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                company_drugs = drug_decode_company.df.index

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [x in company_drugs for x in drug_ids_in_results]

                # .loc replaces the .ix indexer, which was removed in pandas 1.0
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                # Boolean column mask instead of DataFrame.select, which was
                # removed in pandas 1.0
                keep = [get_drug_id(drug) in company_drugs
                        for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        # copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                        os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                     os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)

Example #21

0

Show file

File: gdsc.py Project: howard-lightfoot/gdsctools

    def create_data_packages_for_companies(self, companies=None):
        """Create a report directory for each company.

        For every company and every genomic-feature file in
        ``self.gf_filenames``, the previously computed results are restricted
        to the drugs kept for that company and an HTML report is written to
        ``<company>/<tcga>``. Unless ``self.debug`` is set, the per-drug HTML
        pages and volcano images are copied from the ``ALL`` directory.

        :param companies: one company name (a string), a list of names, or
            None to use ``self.companies``.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed, from memory or,
                # failing that, from the results file on disk. Only regular
                # exceptions trigger the fallback (a bare except would also
                # swallow KeyboardInterrupt/SystemExit).
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                company_drugs = drug_decode_company.df.index

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [x in company_drugs for x in drug_ids_in_results]

                # .loc replaces the .ix indexer, which was removed in pandas 1.0
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

                # Boolean column mask instead of DataFrame.select, which was
                # removed in pandas 1.0
                keep = [get_drug_id(drug) in company_drugs
                        for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep )
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        # copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                os.sep, os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep,
                                tcga, os.sep , os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        pb.animate(i+1)

Example #22

0

Show file

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file

        For every company and every genomic-feature file in
        ``self.gf_filenames``, the previously computed results are restricted
        to the drugs kept for that company and an HTML report is written
        under ``self.company_directory/<company>/<tcga>``.

        :param companies: one company name (a string), a list of names, or
            None to use ``self.companies``.
        :raises ValueError: if the list of companies is empty.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(
                purple("\n=========== Analysing company %s out of %s (%s)" %
                       (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or, failing that, from the results file on disk. Only
                # regular exceptions trigger the fallback (a bare except
                # would also swallow KeyboardInterrupt/SystemExit).
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # Make sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                company_drugs = drug_decode_company.df.index

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [x in company_drugs for x in results.df.DRUG_ID]

                # .loc replaces the .ix indexer, which was removed in pandas 1.0
                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except Exception:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50,
                           gf_filename,
                           drug_decode_company,
                           verbose=False)

                # Keep only the IC50 columns of the drugs kept for this
                # company (boolean column mask instead of DataFrame.select,
                # which was removed in pandas 1.0)
                keep = [drug in company_drugs for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an,
                                          results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

Example #23

0

Show file

def test_set_cancer_type():
    """Check the total of the IC50 table after selecting the breast cancer type."""
    anova = ANOVA(gdsctools_data("IC50_v17.csv.gz"))
    anova.set_cancer_type("breast")

    # Sum over every column, then over the per-column sums.
    grand_total = anova.ic50.df.sum().sum()
    assert_list_almost_equal([grand_total], [27721.255627472943])