コード例 #1
0
    def anova_all(self, animate=True, drugs=None):
        """Run all ANOVA tests for all drugs and all features.

        :param animate: shows the progress bar
        :param drugs: you may select a subset of drugs. When provided, the
            names are used as given: their validity is not checked and the
            minimum number of valid IC50s is not enforced here.
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**. Note that a bare (empty)
            dataframe is returned instead when the concatenated results
            contain no rows.

        Loops over all drugs calling :meth:`anova_one_drug` for each
        drug and concatenating all results together. Results already stored
        in the ``individual_anova`` buffer are reused, not recomputed.
        Once all data are gathered, and if the ``pvalue_correction_level``
        setting is "global", :meth:`add_pvalues_correction` is called on
        the dataframe. An extra column named "ASSOC_ID" is also added with
        a unique identifier, numbered after sorting the rows by ascending
        ANOVA p-value (``ANOVA_FEATURE_pval``).

        .. note:: A thorough comparison with version v17 gives the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        if drugs is not None:
            # if user provided a list of drugs, use them
            # TODO: check validity of the drug names
            drug_names = drugs[:]
        else:
            # drop DRUG where the number of non-null IC50 is below the
            # minimum_nonna_ic50 setting. axis=0 is the default but we
            # emphasize that the sum is over each column (i.e. each drug)
            # FIXME: should be in one_drug_one_feature ??
            valid_counts = self.ic50.df.notnull().sum(axis=0)
            drug_names = valid_counts.index[
                valid_counts >= self.settings.minimum_nonna_ic50]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)
        # the drugs are processed in a random order; the final table is
        # sorted by p-value further below
        pylab.shuffle(drug_names)
        if animate is True:
            pb.animate(0)

        for i, drug_name in enumerate(drug_names):
            # only compute what is not yet in the buffer
            if drug_name not in self.individual_anova:
                self.individual_anova[drug_name] = self.anova_one_drug(
                    drug_name, animate=False, output='dataframe')
            if animate is True:
                pb.animate(i + 1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        # the whole buffer is concatenated, not only the requested drugs
        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            # NOTE(review): this returns a dataframe, not an ANOVAResults
            # instance as on every other path -- confirm callers cope
            return df

        # sort all data by ANOVA p-values
        try:
            df.sort_values('ANOVA_FEATURE_pval', inplace=True)
        except AttributeError:
            # older pandas releases have no sort_values method; any other
            # error (e.g. a missing column) is now propagated as is
            df.sort('ANOVA_FEATURE_pval', inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level == 'global':
            df = self.add_pvalues_correction(df)

        # insert a unique identifier as first column
        df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

        # keep the complete table (all columns, sorted index) on the instance
        self.df = df

        # order the column names as defined in the __init__ method and
        # renumber the rows from 0
        ordered = df[self.column_names].reset_index(drop=True)

        results = ANOVAResults()
        results.df = ordered
        results.settings = ANOVASettings(**self.settings)
        return results
コード例 #2
0
    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file

        :param companies: a single company name (string) or a list of
            company names. Defaults to ``self.companies``.
        :raises ValueError: if the list of companies is empty.

        For each company and each genomic feature file found in
        ``self.gf_filenames``, the results are restricted to the drugs that
        are public (WEBRELEASE is 'Y') or owned by that company, and an
        :class:`ANOVAReport` is built with its directory set to
        ``<company_directory>/<company>/<tcga>``.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(
                purple("\n=========== Analysing company %s out of %s (%s)" %
                       (ii + 1, Ncomp, company)))
            company_directory = os.sep.join([self.company_directory, company])
            self.mkdir(company_directory)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                # the TCGA label is the second "_"-separated field of the
                # filename, truncated at its first dot
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or, failing that, from the results file saved on disk
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    # Exception (not a bare except) so that Ctrl-C and
                    # SystemExit are not swallowed by the fallback
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # Make sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only.
                # The name is passed as a query variable (@company) instead
                # of being pasted into the expression, so that a quote in a
                # company name cannot break the query.
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY==@company")

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                allowed_ids = drug_decode_company.df.index
                mask = [x in allowed_ids for x in results.df.DRUG_ID]

                # boolean selection with .loc (.ix no longer exists in
                # recent pandas releases)
                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except Exception:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50,
                           gf_filename,
                           drug_decode_company,
                           verbose=False)

                # Keep only the IC50 columns (drugs) known to the company's
                # DrugDecode; boolean column mask instead of the removed
                # DataFrame.select method
                ic50_df = an.ic50.df
                allowed_ids = drug_decode_company.df.index
                keep = [drug in allowed_ids for drug in ic50_df.columns]
                an.ic50.df = ic50_df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = os.sep.join([company_directory, tcga])
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an,
                                          results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()
コード例 #3
0
ファイル: anova.py プロジェクト: CancerRxGene/gdsctools
    def anova_all(self, animate=True, drugs=None, multicore=None):
        """Run all ANOVA tests for all drugs and all features.

        :param animate: shows the progress bar
        :param drugs: you may select a subset of drugs. When provided, the
            names are used as given: their validity is not checked and the
            minimum number of valid IC50s is not enforced here.
        :param multicore: if set (truthy), the analysis is delegated to
            ``multicore_analysis(self, drug_names, multicore)`` and the
            buffer of individual results is not consulted beforehand.
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**. Note that a bare (empty)
            dataframe is returned instead when the concatenated results
            contain no rows.

        Calls :meth:`anova_one_drug` for each drug and concatenate all
        results together. Once all data are gathered, and if the
        ``pvalue_correction_level`` setting is "global",
        :meth:`add_pvalues_correction` is called on the dataframe.

        An extra column named "ASSOC_ID" is also added with a unique
        identifier, numbered after sorting the rows by ascending ANOVA
        p-value (``ANOVA_FEATURE_pval``).

        .. note:: A thorough comparison with version v17 gives the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        if self.verbose and len(self.individual_anova):
            print("Reusing some results from the buffer. " "To reset the buffer, call reset_buffer() method")

        if drugs is not None:
            # if user provided a list of drugs, use them
            # TODO: check validity of the drug names
            drug_names = drugs[:]
        else:
            # drop DRUG where the number of non-null IC50 is below the
            # minimum_nonna_ic50 setting. axis=0 is the default but we
            # emphasize that the sum is over each column (i.e. each drug)
            # FIXME: should be in one_drug_one_feature ??
            valid_counts = self.ic50.df.notnull().sum(axis=0)
            drug_names = valid_counts.index[
                valid_counts >= self.settings.minimum_nonna_ic50]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)

        if animate is True:
            pb.animate(0)

        if multicore:
            # Note that here, we do not use the buffer
            multicore_analysis(self, drug_names, multicore)
        else:
            for i, drug_name in enumerate(drug_names):
                # only compute what is not yet in the buffer
                if drug_name not in self.individual_anova:
                    self.individual_anova[drug_name] = self.anova_one_drug(
                        drug_name, animate=False, output="dataframe")
                if animate is True:
                    pb.animate(i + 1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        # the whole buffer is concatenated, not only the requested drugs
        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            # NOTE(review): this returns a dataframe, not an ANOVAResults
            # instance as on every other path -- confirm callers cope
            return df

        # sort all data by ANOVA p-values
        try:
            df.sort_values("ANOVA_FEATURE_pval", inplace=True)
        except AttributeError:
            # older pandas releases have no sort_values method; any other
            # error (e.g. a missing column) is now propagated as is
            df.sort("ANOVA_FEATURE_pval", inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level == "global":
            df = self.add_pvalues_correction(df)

        # insert a unique identifier as first column
        df.insert(0, "ASSOC_ID", range(1, len(df) + 1))

        # keep the complete table (all columns, sorted index) on the instance
        self.df = df

        # order the column names as defined in the __init__ method and
        # renumber the rows from 0
        ordered = df[self.column_names].reset_index(drop=True)

        results = ANOVAResults()
        results.df = ordered
        results.settings = ANOVASettings(**self.settings)
        return results
コード例 #4
0
ファイル: gdsc.py プロジェクト: CancerRxGene/gdsctools
    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file

        :param companies: a single company name (string) or a list of
            company names. Defaults to ``self.companies``.
        :raises ValueError: if the list of companies is empty.

        For each company and each genomic feature file found in
        ``self.gf_filenames``, the results are restricted to the drugs that
        are public (WEBRELEASE is 'Y') or owned by that company, and an
        :class:`ANOVAReport` is built with its directory set to
        ``<company_directory>/<company>/<tcga>``.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError("Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company)))
            company_directory = os.sep.join([self.company_directory, company])
            self.mkdir(company_directory)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                # the TCGA label is the second "_"-separated field of the
                # filename, truncated at its first dot
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or, failing that, from the results file saved on disk
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    # Exception (not a bare except) so that Ctrl-C and
                    # SystemExit are not swallowed by the fallback
                    results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # Make sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only.
                # The name is passed as a query variable (@company) instead
                # of being pasted into the expression, so that a quote in a
                # company name cannot break the query.
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY==@company")

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                allowed_ids = drug_decode_company.df.index
                mask = [x in allowed_ids for x in results.df.DRUG_ID]

                # boolean selection with .loc (.ix no longer exists in
                # recent pandas releases)
                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except Exception:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                    verbose=False)

                # Keep only the IC50 columns (drugs) known to the company's
                # DrugDecode; boolean column mask instead of the removed
                # DataFrame.select method
                ic50_df = an.ic50.df
                allowed_ids = drug_decode_company.df.index
                keep = [drug in allowed_ids for drug in ic50_df.columns]
                an.ic50.df = ic50_df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = os.sep.join([company_directory, tcga])
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                        drug_decode=drug_decode_company,
                        verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()
コード例 #5
0
ファイル: gdsc.py プロジェクト: howard-lightfoot/gdsctools
    def create_data_packages_for_companies(self, companies=None):
        """Create a data package (HTML report) for each company.

        :param companies: a single company name (string) or a list of
            company names. Defaults to ``self.companies``.

        For each company and each genomic feature file found in
        ``self.gf_filenames``, the results are restricted to the drugs that
        are public (WEBRELEASE is 'Y') or owned by that company, and an
        :class:`ANOVAReport` is built with its directory set to
        ``<company>/<tcga>``. Unless ``self.debug`` is set, the per-drug
        HTML pages and volcano images are then copied from the ``ALL``
        directory.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                # the TCGA label is the second "_"-separated field of the
                # filename, truncated at its first dot
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed, either from memory
                # or, failing that, from the results file saved under ALL/
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    # Exception (not a bare except) so that Ctrl-C and
                    # SystemExit are not swallowed by the fallback
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company. The name is passed as
                # a query variable (@company) instead of being pasted into
                # the expression, so that a quote in a company name cannot
                # break the query.
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY==@company")
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                company_ids = drug_decode_company.df.index
                mask = [x in company_ids for x in drug_ids_in_results]

                # boolean selection with .loc (.ix no longer exists in
                # recent pandas releases)
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

                # boolean column mask instead of the removed
                # DataFrame.select method
                ic50_df = an.ic50.df
                company_ids = drug_decode_company.df.index
                keep = [get_drug_id(drug) in company_ids
                        for drug in ic50_df.columns]
                an.ic50.df = ic50_df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL. The copies are done with
                    # shutil/glob rather than a shell "cp" command line so
                    # that company or TCGA names containing spaces or shell
                    # metacharacters are handled correctly.
                    import glob
                    import shutil
                    from easydev import Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))

                    source_dir = os.path.join("ALL", tcga)
                    source_images = os.path.join(source_dir, "images")
                    # trailing separator: the destination must be an
                    # existing directory, never a new file name
                    dest_dir = os.path.join(company, tcga) + os.sep
                    dest_images = os.path.join(company, tcga, "images") + os.sep

                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        shutil.copy(
                            os.path.join(source_dir, "%s.html" % drug_id),
                            dest_dir)
                        # copy the images (any extension)
                        pattern = os.path.join(source_images,
                                               "volcano_%s.*" % drug_id)
                        images = glob.glob(pattern)
                        if not images:
                            # fail loudly rather than silently skipping a
                            # missing image (shutil.copy does the same for
                            # a missing HTML page)
                            raise IOError("No file matching %s" % pattern)
                        for image in images:
                            shutil.copy(image, dest_images)
                        pb.animate(i+1)
コード例 #6
0
ファイル: gdsc.py プロジェクト: howard-lightfoot/gdsctools
    def create_data_packages_for_companies(self, companies=None):
        """Create a data package (HTML report) for each company.

        :param companies: a single company name (string) or a list of
            company names. Defaults to ``self.companies``.

        For each company and each genomic feature file found in
        ``self.gf_filenames``, the results are restricted to the drugs that
        are public (WEBRELEASE is 'Y') or owned by that company, and an
        :class:`ANOVAReport` is built with its directory set to
        ``<company>/<tcga>``. Unless ``self.debug`` is set, the per-drug
        HTML pages and volcano images are then copied from the ``ALL``
        directory.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                # the TCGA label is the second "_"-separated field of the
                # filename, truncated at its first dot
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed, either from memory
                # or, failing that, from the results file saved under ALL/
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    # Exception (not a bare except) so that Ctrl-C and
                    # SystemExit are not swallowed by the fallback
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company. The name is passed as
                # a query variable (@company) instead of being pasted into
                # the expression, so that a quote in a company name cannot
                # break the query.
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY==@company")
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                company_ids = drug_decode_company.df.index
                mask = [x in company_ids for x in drug_ids_in_results]

                # boolean selection with .loc (.ix no longer exists in
                # recent pandas releases)
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                # boolean column mask instead of the removed
                # DataFrame.select method
                ic50_df = an.ic50.df
                company_ids = drug_decode_company.df.index
                keep = [
                    get_drug_id(drug) in company_ids
                    for drug in ic50_df.columns
                ]
                an.ic50.df = ic50_df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL. The copies are done with
                    # shutil/glob rather than a shell "cp" command line so
                    # that company or TCGA names containing spaces or shell
                    # metacharacters are handled correctly.
                    import glob
                    import shutil
                    from easydev import Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))

                    source_dir = os.path.join("ALL", tcga)
                    source_images = os.path.join(source_dir, "images")
                    # trailing separator: the destination must be an
                    # existing directory, never a new file name
                    dest_dir = os.path.join(company, tcga) + os.sep
                    dest_images = os.path.join(company, tcga,
                                               "images") + os.sep

                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        shutil.copy(
                            os.path.join(source_dir, "%s.html" % drug_id),
                            dest_dir)
                        # copy the images (any extension)
                        pattern = os.path.join(source_images,
                                               "volcano_%s.*" % drug_id)
                        images = glob.glob(pattern)
                        if not images:
                            # fail loudly rather than silently skipping a
                            # missing image (shutil.copy does the same for
                            # a missing HTML page)
                            raise IOError("No file matching %s" % pattern)
                        for image in images:
                            shutil.copy(image, dest_images)
                        pb.animate(i + 1)