def get_html_table(self, collapse_table=False, clip_threshold=2,
            index=False, header=True, escape=False):
        cmap_clip = cmap_builder('#ffffff', '#0070FF')
        cmap_absmax = cmap_builder('green', 'white', 'red')

        columns = ANOVAResults().colnames_subset

        # The copy is used because we'll change it afterwards
        df = self.df[self.colnames_subset].copy()

        colname = 'ANOVA_FEATURE_FDR'

        df.loc[self.df[colname] < 0.01, colname] = '<0.01'

        html = HTMLTable(self.df, 'notused')
        # Those columns should be links
        for this in ['FEATURE', 'DRUG_ID', 'ASSOC_ID']:
            html.add_href(this)

        for this in ['FEATURE_IC50_effect_size', 'FEATURE_neg_Glass_delta',
                'FEATURE_pos_Glass_delta']:
            html.add_bgcolor(this, cmap_clip, mode='clip',
                    threshold=clip_threshold)

        # normalise data and annotate with color
        html.add_bgcolor('FEATURE_delta_MEAN_IC50', cmap_absmax,
            mode='absmax')

        html.df.columns = [x.replace("_", " ") for x in html.df.columns]
        return html.to_html(escape=escape, header=header, index=index,
                collapse_table=collapse_table, justify='center')
Example #2
0
def test_htmltable():
    df = pd.DataFrame({
        'A':[1,2,10], 
        'B':[1,10,2], 
        'C':[1,10,2], 
        'url':['A', 'B', 'C']})
    html = HTMLTable(df, 'test')
    html.add_href('url')
    html.add_bgcolor('A')
    html.add_bgcolor('B', mode='clip', threshold=2)
    html.add_bgcolor('C', mode='max', threshold=2)
    print(html.to_html())
Example #3
0
    def add_cosmic(self):
        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()
Example #4
0
    def add_cosmic(self):
        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()
Example #5
0
    def add_summary(self):
        # A summary table
        diag = self.report.diagnostics()
        table = HTMLTable(diag, 'summary')
        txt = ''

        for index, row in diag.iterrows():
            if len(row.text) == 0 and len(row.value) == 0:
                txt += '----<br/>'
            else:
                txt += row.text + ": " + str(row.value) + "<br/>"
        self.jinja['summary'] = txt
    def _create_report(self, onweb=True):
        section = """<div>
        <b>Regression method:</b> %s </br>
        </div><hr>
        """ % self.caller.method
        self.jinja['sections'].append(section)

        filename = self.caller.prefix + "results.csv"
        df = pd.read_csv(filename)
        df['ttest (-log10)'] = -pylab.log10(df['ttest'])
        # prevents inf to fail in the HTMLTable
        table = HTMLTable(df)
        table.add_bgcolor('bayes')
        table.add_bgcolor('Rp')
        table.df['drugid'] = ['<a href="drug_%s.html">%s</a>' % (x,x)
                          for x in table.df['drugid']]

        html = ("<div><p>This table contains links to all drugs. The Rp columns"
                " contains the coefficient of correlation found with the"
                " method for the give alpha parameter. The ln_alpha column"
                " is just the -log10(alpha) value. The bayes and ttest columns"
                " gives an idea of the significance of the correlation as "
                " compared to a null distribution.</p>")
        html += table.to_html(index=False) +"</div>"
        self.jinja['sections'].append(html)

        filename = self.caller.prefix + "scatter_plot.png"
        html = "<hr><div>"
        html += "<img src=%s></img>" % filename
        html += "/<div>"

        self.jinja['sections'].append(html)

        # The scatter plot. First the javascript in the header
        self._set_scatter()
        # and the section itself
        html = """
         <div class="wrap">
          <div class="content">
              <center>
                  <canvas id='canvasVolcano' width='800' height='540'></canvas>
              </center>
          </div>
          <div class="clear">&nbsp;</div>
        </div>
        """
        self.jinja["sections"].append(html)
Example #7
0
def test_htmltable():
    df = pd.DataFrame({
        'A': [1, 2, 10],
        'B': [1, 10, 2],
        'C': [1, 10, 2],
        'url': ['A', 'B', 'C']
    })
    html = HTMLTable(df, 'test')
    html.add_href('url')
    html.add_bgcolor('A')
    html.add_bgcolor('B', mode='clip', threshold=2)
    html.add_bgcolor('C', mode='max', threshold=2)
    print(html.to_html())
Example #8
0
    def _create_report(self, onweb=True):

        # The top section with standard information
        section = """<div>
        <b>Regression method:</b> %s </br>
        </div><hr>
        """ % self.caller.method
        self.jinja['sections'].append(section)

        # The main CSV tables with bayes factor and links to each drug ID
        filename = self.caller.prefix_data + "results.csv"

        df = pd.read_csv(filename)
        df['ttest (-log10)'] = -pylab.log10(df['ttest'])
        # prevents inf to fail in the HTMLTable
        table = HTMLTable(df)
        table.add_bgcolor('bayes')
        table.add_bgcolor('Rp')
        table.df['drugid'] = [
            '<a href="drug_%s.html">%s</a>' % (x, x)
            for x in table.df['drugid']
        ]

        html = (
            "<div><p>This table contains links to all drugs (first column)."
            " The Rp column contains the coefficient of correlation"
            " (pearson) found with the regression method for the alpha"
            " parameter provided in column 3. The alpha value is the optimised"
            " value obtained using a cross validation (see below)."
            " The ln_alpha column is just the -log10(alpha) value. The bayes"
            " factor gives an idea of the significance of the correlation as "
            " compared to a null distribution. See"
            ' <a href="http://gdsctools.readthedocs.io/en/master/references.html">'
            'gdstools documentation.</a> for details.'
            "<br>"
            " Note also that the optimisation of the alpha parameter is"
            " performed using a cross validation and depends on a few"
            " parameters such as the range of alpha values, number of "
            " cross validation, ....</p>")
        html += table.to_html(index=False)

        pattern = '<div>%s <p>Download the CSV <a href="%s">file</a></p></div><hr>'
        pattern = pattern % (html, filename)
        html = pattern

        self.jinja['sections'].append(html)

        # The scatter plot. First the javascript in the header
        self._set_scatter()
        # and the section itself
        html = """
         <div class="wrap">
          <div class="content">
              <center>
                  <canvas id='canvasVolcano' width='800' height='540'></canvas>
              </center>
          </div>
          <div class="clear">&nbsp;</div>
        </div>
        """
        self.jinja["sections"].append(html)
Example #9
0
    def add_features(self):
        
        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = ["{}-{}".format(x, get_name(x)) for x in df_drugs.index]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        if len(self.report.df) == 0:
            return

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()

        df = df.reset_index() # get back the Drug id in the dframe columns
        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID', url="associations/drug_", suffix=".html")
        table.df.columns = [x.replace('ANOVA_FEATURE_FDR',
            'mean FEATURE ANOVA FDR') for x in table.df.columns]
        table.add_bgcolor('hits', mode='max',
                cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                header=True, index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE', url="associations/", suffix=".html")
        table.add_bgcolor('hits', mode='max',
                cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                header=True, index=False)
Example #10
0
    def get_html_table(self,
                       collapse_table=False,
                       clip_threshold=2,
                       index=False,
                       header=True,
                       escape=False,
                       add_href=True):
        """Return an HTML table for the reports


        :param add_href: add href to the FEATURE, DRUG ID and ASSOC ID

        """
        cmap_clip = cmap_builder('#ffffff', '#0070FF')
        cmap_absmax = cmap_builder('green', 'white', 'red')

        # The copy is used because we'll change it afterwards
        df = self.df[self.colnames_subset].copy()

        colname = 'ANOVA_FEATURE_FDR'

        df.loc[df[colname] < 0.01, colname] = '<0.01'
        # In the assoc column, we remove the first "a" letter so that
        # the column is properly sorted by Id but the link should be with the
        # "a" as prefix
        df.ASSOC_ID = df.ASSOC_ID.apply(lambda x: int(str(x).replace("a", "")))

        html = HTMLTable(df, 'notused')
        # Those columns should be links
        if add_href:
            html.add_href("FEATURE")
            html.add_href("ASSOC_ID", url="a",
                          suffix=".html")  # here url works like a prefix
            html.add_href("DRUG_ID", url="drug_",
                          suffix=".html")  # here url works like a prefix

        for this in [
                'FEATURE_IC50_effect_size', 'FEATURE_neg_Glass_delta',
                'FEATURE_pos_Glass_delta'
        ]:
            html.add_bgcolor(this,
                             cmap_clip,
                             mode='clip',
                             threshold=clip_threshold)

        # normalise data and annotate with color
        html.add_bgcolor('FEATURE_delta_MEAN_IC50', cmap_absmax, mode='absmax')

        html.df.columns = [x.replace("_", " ") for x in html.df.columns]
        return html.to_html(escape=escape,
                            header=header,
                            index=index,
                            collapse_table=collapse_table,
                            justify='center')
Example #11
0
    def create_summary_pages(self, main_directory="tissue_packages"):
        # Read in ALL all directories

        # create directories and copy relevant files
        directories = glob.glob(main_directory + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images', 'INPUT', 'OUTPUT', 'code', 'js' ]:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([tcga, total_hits,
                drug_inv_prop, info['N_prop'], 
                drug_inv_public, info['N_public']])
        df = pd.DataFrame(summary)
        df.columns = ['Analysis name', 'Number of hits', 
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of']
        

        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory=main_directory, 
                filename='index.html',
                template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.jinja['collaborator'] = main_directory
        #self.html_page.jinja['analysis_domain'] = 'v18'
        self.html_page.write()

        return df
Example #12
0
    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename ])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename ])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename ])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep +filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results)>0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')
            

            summary.append([tcga, total_hits,
                drug_inv_prop, info['N_prop'], 
                drug_inv_public, info['N_public']])
        df = pd.DataFrame(summary)
        df.columns = ['Analysis name', 'Number of hits', 
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of']


        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL', filename='index.html',
                template_filename='datapack_summary.html' )

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df
Example #13
0
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA Tissues
    and comnpanies.

    Reads

    1. Nf Genomic feature files for different TCGA types.
    2. a unique DRUG DECODER file
    3. a unique IC50 file

    and perform the Nf analysis saving results in appropriate files.

    Then split the data for each companies.

    It also converts tissue names into TCGA names.

    .. todo:: How to get the GF files

    First, create all main analysis that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contains specific TCGA GF
        gg.run() # This will take hours depending on the number of drugs.


        # On v18, On an i7 core using 1 CPU this tqkes about 1 hour.30 minutes
        # THe PANCAN data set is the largest and takes about 1 hour itself

    You should now have a directory called **ALL** with about 20 directories for
    each TCGA GF file. Keep that in a safe place or you will have to restart
    the analysis

    Second, split those data just created for each specific proprietary 
    compounds. For instance::

        gg.create_data_packages_for_companies(['AZ'])
    
    or for all in one go::

        gg.create_data_packages_for_companies()


    Third, create some summary pages::

        from gdsctools.gdsc import GDSCDirectorySummary()
        gs = GDSCDirectorySummary()
        gs.create_summary_pages('ALL')
        for company in gg.companies:
            gs.create_summary_pages(company)

    The last step is fast but the whole process of analyse and image 
    creation is very long. 



    """
    def __init__(self, ic50, drug_decode,
            genomic_feature_pattern="GF_*csv",
            mode='standard'):
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)
        self.settings = ANOVASettings()
        self.settings.low_memory = True

        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print("Those settings will be used (note that low_memory is set "
              "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

    def run(self):
        self.mkdir('ALL')
        # First analyse all case of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init() # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [True if x in drug_decode_company.df.index else False
                        for x in drug_ids_in_results]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep
                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1) 

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from 
                    # the analysis made in ALL 
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep )
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                os.sep, os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep,
                                tcga, os.sep , os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        pb.animate(i+1)

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]
    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies)

    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename ])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename ])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename ])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep +filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results)>0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')
            

            summary.append([tcga, total_hits,
                drug_inv_prop, info['N_prop'], 
                drug_inv_public, info['N_public']])
        df = pd.DataFrame(summary)
        df.columns = ['Analysis name', 'Number of hits', 
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of']


        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL', filename='index.html',
                template_filename='datapack_summary.html' )

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults('ALL' + os.sep + tcga + os.sep +
                    'OUTPUT' + os.sep + 'results.csv')
Example #14
0
    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep + filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])
        df = pd.DataFrame(summary)
        df.columns = [
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ]

        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL',
                                    filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name',
                                 newtab=True,
                                 url=None,
                                 suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df
Example #15
0
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA Tissues
    and comnpanies.

    Reads

    1. Nf Genomic feature files for different TCGA types.
    2. a unique DRUG DECODER file
    3. a unique IC50 file

    and perform the Nf analysis saving results in appropriate files.

    Then split the data for each companies.

    It also converts tissue names into TCGA names.

    .. todo:: How to get the GF files

    First, create all main analysis that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contains specific TCGA GF
        gg.run() # This will take hours depending on the number of drugs.


        # On v18, On an i7 core using 1 CPU this tqkes about 1 hour.30 minutes
        # THe PANCAN data set is the largest and takes about 1 hour itself

    You should now have a directory called **ALL** with about 20 directories for
    each TCGA GF file. Keep that in a safe place or you will have to restart
    the analysis

    Second, split those data just created for each specific proprietary 
    compounds. For instance::

        gg.create_data_packages_for_companies(['AZ'])
    
    or for all in one go::

        gg.create_data_packages_for_companies()


    Third, create some summary pages::

        from gdsctools.gdsc import GDSCDirectorySummary()
        gs = GDSCDirectorySummary()
        gs.create_summary_pages('ALL')
        for company in gg.companies:
            gs.create_summary_pages(company)

    The last step is fast but the whole process of analyse and image 
    creation is very long. 



    """
    def __init__(self,
                 ic50,
                 drug_decode,
                 genomic_feature_pattern="GF_*csv",
                 mode='standard'):
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)
        self.settings = ANOVASettings()
        self.settings.low_memory = True

        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print(
            "Those settings will be used (note that low_memory is set "
            "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

    def run(self):
        self.mkdir('ALL')
        # First analyse all case of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init()  # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in drug_ids_in_results
                ]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                        os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                     os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]

    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']

    companies = property(_get_companies)

    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep + filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])
        df = pd.DataFrame(summary)
        df.columns = [
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ]

        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL',
                                    filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name',
                                 newtab=True,
                                 url=None,
                                 suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults('ALL' + os.sep + tcga + os.sep +
                                              'OUTPUT' + os.sep +
                                              'results.csv')
    def _create_report(self, onweb=True):
        # A summary table
        diag = self.report.diagnostics()
        table = HTMLTable(diag, 'summary')
        txt = ''
        for index, row in diag.iterrows():
            if len(row.text) == 0 and len(row.value) == 0:
                txt += '----<br/>'
            else:
                txt += row.text + ": " +  str(row.value) + "<br/>"
        self.jinja['summary'] = txt

        print('Creating volcano plots')
        # this can be pretty slow. so keep only 1000 most relevant
        # values and 1000 random ones to get an idea of the distribution
        v = VolcanoANOVA(self.report.df, settings=self.settings)
        v.selector(v.df, 1500, 1500, inplace=True)
        v.volcano_plot_all()
        v.savefig_and_js("volcano_all_js")

        self.jinja['volcano'] = """
            <h3></h3>
            <a href="volcano_all_js.html">
                <img alt="volcano plot for all associations"
                    src="volcano_all_js.png">
            </a>
            <br/>
            <p> A javascript version is available
                <a href="volcano_all_js.html">here</a> (
                or click on the image).</p>
        """

        # MANOVA link
        N = len(self.report.get_significant_set())
        self.jinja['manova'] = """
        There were %(N)s significant associations found.
        All significant associations have been gatherered
        in the following link: <br/><a href="manova.html">manova results</a>.
        """ % {'N': N}

        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        # drug summary
        #not_tested = [x for x in self.report.gdsc.drugIds if x not in
        #        self.report.df.DRUG_ID.unique()]
        #if len(not_tested) > 0:
        #    not_tested = """%s drugs were not analysed due to
        #    lack of valid data points: """ % len(not_tested) + \
        #            ", ".join(not_tested)
        #else:
        #    not_tested = ""
        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
        df = df.reset_index() # get back the Drug id in the dframe columns

        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID')
        table.df.columns = [x.replace('ANOVA_FEATURE_FDR',
            'mean FEATURE ANOVA FDR') for x in table.df.columns]
        table.add_bgcolor('hits', mode='max',
                cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                header=True, index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE')
        table.add_bgcolor('hits', mode='max',
                cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                header=True, index=False)

        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()

        # -------------------------------------- settings and INPUT files
        input_dir = self.directory + os.sep + 'INPUT'
        filename = 'ANOVA_input.csv'
        filename = os.sep.join([input_dir, filename])
        self.report.gdsc.ic50.to_csv(filename)
        filename = os.sep.join(['INPUT', 'ANOVA_input.csv'])
        self.jinja['ic50_file'] = filename

        # the genomic features, which may be the default version
        # one provided by the user. It may have been changed
        gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
        self.report.gdsc.features.to_csv(gf_filename)
        html = """Saved <a href="INPUT/genomic_features.csv">Genomic
                  Features</a> file<br/> (possibly the default
                  version)."""
        self.jinja['gf_file'] = html

        # Always save DRUG_DECODE file even if empty
        # It may be be interpreted in other pipeline or for reproducibility
        output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
        self.report.drug_decode.to_csv(output_filename)
        html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
        if len(self.report.drug_decode) == 0:
            html += 'Note that DRUG_DECODE file was not provided (empty?).'
        self.jinja['drug_decode'] = html

        # Save settings as json file
        filename = os.sep.join([input_dir, 'settings.json'])
        self.settings.to_json(filename)
        filename = os.path.basename(filename)
        self.jinja['settings'] = \
                """Get the settings as a <a href="INPUT/%s">
                json file</a>.""" % filename

        # Save all Results dataframe
        filename = os.sep.join([self.settings.directory, 'OUTPUT',
            'results.csv'])
        ANOVAResults(self.report.df).to_csv(filename)

        code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
        code = code % {
                'ic50': 'ANOVA_input.csv',
                'gf_filename': 'genomic_features.csv'}

        filename = os.sep.join([self.settings.directory, 'code','rerun.py'])
        fh = open(filename, 'w')
        fh.write(code)
        fh.close()
Example #17
0
    def add_features(self):
        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = [
                "{}-{}".format(x, get_name(x)) for x in df_drugs.index
            ]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        if len(self.report.df) == 0:
            return

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()

        df = df.reset_index()  # get back the Drug id in the dframe columns
        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID', url="associations/drug_", suffix=".html")
        table.df.columns = [
            x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
            for x in table.df.columns
        ]
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                                                 header=True,
                                                 index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE', url="associations/", suffix=".html")
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                                                    header=True,
                                                    index=False)
    def _create_report(self, onweb=True):
        # A summary table
        diag = self.report.diagnostics()
        table = HTMLTable(diag, 'summary')
        txt = ''
        for index, row in diag.iterrows():
            if len(row.text) == 0 and len(row.value) == 0:
                txt += '----<br/>'
            else:
                txt += row.text + ": " + str(row.value) + "<br/>"
        self.jinja['summary'] = txt

        print('Creating volcano plots')
        # this can be pretty slow. so keep only 1000 most relevant
        # values and 1000 random ones to get an idea of the distribution
        v = VolcanoANOVA(self.report.df, settings=self.settings)
        v.selector(v.df, 1500, 1500, inplace=True)
        v.volcano_plot_all()
        v.savefig_and_js("volcano_all_js")

        self.jinja['volcano'] = """
            <h3></h3>
            <a href="volcano_all_js.html">
                <img alt="volcano plot for all associations"
                    src="volcano_all_js.png">
            </a>
            <br/>
            <p> A javascript version is available
                <a href="volcano_all_js.html">here</a> (
                or click on the image).</p>
        """

        # MANOVA link
        N = len(self.report.get_significant_set())
        self.jinja['manova'] = """
        There were %(N)s significant associations found.
        All significant associations have been gatherered
        in the following link: <br/><a href="manova.html">manova results</a>.
        """ % {
            'N': N
        }

        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        # drug summary
        #not_tested = [x for x in self.report.gdsc.drugIds if x not in
        #        self.report.df.DRUG_ID.unique()]
        #if len(not_tested) > 0:
        #    not_tested = """%s drugs were not analysed due to
        #    lack of valid data points: """ % len(not_tested) + \
        #            ", ".join(not_tested)
        #else:
        #    not_tested = ""
        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
        df = df.reset_index()  # get back the Drug id in the dframe columns

        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID')
        table.df.columns = [
            x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
            for x in table.df.columns
        ]
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                                                 header=True,
                                                 index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE')
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                                                    header=True,
                                                    index=False)

        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()

        # -------------------------------------- settings and INPUT files
        input_dir = self.directory + os.sep + 'INPUT'
        filename = 'ANOVA_input.csv'
        filename = os.sep.join([input_dir, filename])
        self.report.gdsc.ic50.to_csv(filename)
        filename = os.sep.join(['INPUT', 'ANOVA_input.csv'])
        self.jinja['ic50_file'] = filename

        # the genomic features, which may be the default version
        # one provided by the user. It may have been changed
        gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
        self.report.gdsc.features.to_csv(gf_filename)
        html = """Saved <a href="INPUT/genomic_features.csv">Genomic
                  Features</a> file<br/> (possibly the default
                  version)."""
        self.jinja['gf_file'] = html

        # Always save DRUG_DECODE file even if empty
        # It may be be interpreted in other pipeline or for reproducibility
        output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
        self.report.drug_decode.to_csv(output_filename)
        html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
        if len(self.report.drug_decode) == 0:
            html += 'Note that DRUG_DECODE file was not provided (empty?).'
        self.jinja['drug_decode'] = html

        # Save settings as json file
        filename = os.sep.join([input_dir, 'settings.json'])
        self.settings.to_json(filename)
        filename = os.path.basename(filename)
        self.jinja['settings'] = \
                """Get the settings as a <a href="INPUT/%s">
                json file</a>.""" % filename

        # Save all Results dataframe
        filename = os.sep.join(
            [self.settings.directory, 'OUTPUT', 'results.csv'])
        ANOVAResults(self.report.df).to_csv(filename)

        code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
        code = code % {
            'ic50': 'ANOVA_input.csv',
            'gf_filename': 'genomic_features.csv'
        }

        filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
        fh = open(filename, 'w')
        fh.write(code)
        fh.close()