コード例 #1
0
    def volcano(self):
        """Calls :class:`VolcanoANOVA` on the results

        x-value is sign(FEATURE_delta_MEAN_IC50) times FEATURE_IC50_effect_size
        y-value is the FDR correction

        """
        self.handle_volcano = VolcanoANOVA(self.df, settings=self.settings)
        self.handle_volcano.volcano_plot_all()
コード例 #2
0
    def create_pictures(self):
        v = VolcanoANOVA(self.df, settings=self.settings)
        v.volcano_plot_one_drug(self.drug)
        v.savefig_and_js('images/volcano_{}'.format(self.drug), size_inches=(10,10))

        # See https://github.com/CancerRxGene/gdsctools/issues/79
        v.current_fig.canvas.mpl_disconnect(v.cid)
        try:
            import mpld3
            mpld3.plugins.clear(v.current_fig)
        except:
            pass
コード例 #3
0
ファイル: anova_results.py プロジェクト: shukwong/gdsctools
    def volcano(self, settings=None):
        """Calls :class:`VolcanoANOVA` on the results

        x-value is sign(FEATURE_delta_MEAN_IC50) times FEATURE_IC50_effect_size
        y-value is the FDR correction

        See the online documentation for details on gdsctools.readthedocs.io.

        """
        if len(self.df) == 0:
            print("No data to plot")
            return
        self.handle_volcano = VolcanoANOVA(self.df, settings=settings)
        self.handle_volcano.volcano_plot_all()
コード例 #4
0
    def create_pictures(self):
        v = VolcanoANOVA(self.df, settings=self.settings)
        v.volcano_plot_one_feature(self.feature)
        v.savefig_and_js('images/volcano_{}'.format(self.feature))

        # See https://github.com/CancerRxGene/gdsctools/issues/79
        v.current_fig.canvas.mpl_disconnect(v.cid)
        try:
            import mpld3
            mpld3.plugins.clear(v.current_fig)
        except:
            pass
コード例 #5
0
    def _create_report(self, onweb=True):
        # A summary table
        diag = self.report.diagnostics()
        table = HTMLTable(diag, 'summary')
        txt = ''
        for index, row in diag.iterrows():
            if len(row.text) == 0 and len(row.value) == 0:
                txt += '----<br/>'
            else:
                txt += row.text + ": " +  str(row.value) + "<br/>"
        self.jinja['summary'] = txt

        print('Creating volcano plots')
        # this can be pretty slow. so keep only 1000 most relevant
        # values and 1000 random ones to get an idea of the distribution
        v = VolcanoANOVA(self.report.df, settings=self.settings)
        v.selector(v.df, 1500, 1500, inplace=True)
        v.volcano_plot_all()
        v.savefig_and_js("volcano_all_js")

        self.jinja['volcano'] = """
            <h3></h3>
            <a href="volcano_all_js.html">
                <img alt="volcano plot for all associations"
                    src="volcano_all_js.png">
            </a>
            <br/>
            <p> A javascript version is available
                <a href="volcano_all_js.html">here</a> (
                or click on the image).</p>
        """

        # MANOVA link
        N = len(self.report.get_significant_set())
        self.jinja['manova'] = """
        There were %(N)s significant associations found.
        All significant associations have been gatherered
        in the following link: <br/><a href="manova.html">manova results</a>.
        """ % {'N': N}

        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        # drug summary
        #not_tested = [x for x in self.report.gdsc.drugIds if x not in
        #        self.report.df.DRUG_ID.unique()]
        #if len(not_tested) > 0:
        #    not_tested = """%s drugs were not analysed due to
        #    lack of valid data points: """ % len(not_tested) + \
        #            ", ".join(not_tested)
        #else:
        #    not_tested = ""
        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
        df = df.reset_index() # get back the Drug id in the dframe columns

        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID')
        table.df.columns = [x.replace('ANOVA_FEATURE_FDR',
            'mean FEATURE ANOVA FDR') for x in table.df.columns]
        table.add_bgcolor('hits', mode='max',
                cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                header=True, index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE')
        table.add_bgcolor('hits', mode='max',
                cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                header=True, index=False)

        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()

        # -------------------------------------- settings and INPUT files
        input_dir = self.directory + os.sep + 'INPUT'
        filename = 'ANOVA_input.csv'
        filename = os.sep.join([input_dir, filename])
        self.report.gdsc.ic50.to_csv(filename)
        filename = os.sep.join(['INPUT', 'ANOVA_input.csv'])
        self.jinja['ic50_file'] = filename

        # the genomic features, which may be the default version
        # one provided by the user. It may have been changed
        gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
        self.report.gdsc.features.to_csv(gf_filename)
        html = """Saved <a href="INPUT/genomic_features.csv">Genomic
                  Features</a> file<br/> (possibly the default
                  version)."""
        self.jinja['gf_file'] = html

        # Always save DRUG_DECODE file even if empty
        # It may be be interpreted in other pipeline or for reproducibility
        output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
        self.report.drug_decode.to_csv(output_filename)
        html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
        if len(self.report.drug_decode) == 0:
            html += 'Note that DRUG_DECODE file was not provided (empty?).'
        self.jinja['drug_decode'] = html

        # Save settings as json file
        filename = os.sep.join([input_dir, 'settings.json'])
        self.settings.to_json(filename)
        filename = os.path.basename(filename)
        self.jinja['settings'] = \
                """Get the settings as a <a href="INPUT/%s">
                json file</a>.""" % filename

        # Save all Results dataframe
        filename = os.sep.join([self.settings.directory, 'OUTPUT',
            'results.csv'])
        ANOVAResults(self.report.df).to_csv(filename)

        code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
        code = code % {
                'ic50': 'ANOVA_input.csv',
                'gf_filename': 'genomic_features.csv'}

        filename = os.sep.join([self.settings.directory, 'code','rerun.py'])
        fh = open(filename, 'w')
        fh.write(code)
        fh.close()
コード例 #6
0
def test_volcano_plot():

    an = ANOVA(ic50_test)
    an.features.df = an.features.df[an.features.df.columns[0:10]]
    an = ANOVA(ic50_test, genomic_features=an.features.df)

    results = an.anova_all()

    # try the constructors
    v = VolcanoANOVA(results.df)
    v = VolcanoANOVA(results)

    # the selector metho
    v.df = v.selector(v.df)

    v.settings.savefig = False

    # some of the plotting
    v.volcano_plot_all_drugs()
    v.volcano_plot_all_features()
    v.volcano_plot_all()


    v._get_fdr_from_pvalue_interp(1e-10)
    v._get_pvalue_from_fdr(50)
    v._get_pvalue_from_fdr([50,60])
コード例 #7
0
class ANOVAResults(object):
    """Class to handle results of the ANOVA analysis

    The :class:`ANOVA` performs the regression and ANOVA analysis and 
    returns an :class:`ANOVAResults` instance (e.g., when you call 
    :meth:`gdsctools.anova.ANOVA.anova_all` method). The :class:`ANOVAResults`
    contains a dataframe with all results in :attr:`df`. 

    The columns of the dataframe are defined as follows:


    =========================== ===============================================
         Column name                                  Description
    =========================== ===============================================
    ASSOC_ID                    Alphanumerical identifier of the interaction
    FEATURE                     The CFE involved in the interaction, it
                                can be a mutated cancer driver gene
                                (CG) [suffix _mut], an abberrantly
                                fused protein [suffix fusion], a copy
                                number altered chromosomal region (RACS)
                                [prefix gain for amplifications or loss
                                for deletions];
    DRUG_ID                     Numerical id of the drug involved in the
                                interaction;
    DRUG_TARGET                 Putative target of the drug involved in the
                                interaction;
    N_FEATURE_pos               Number of cell lines harbouring the CFE
                                indicated in column E and that have been
                                screened with the drug indicated in columns
                                F and G, therefore have been included in
                                the test;
    N_FEATURE_neg               Number of cell lines not harbouring the
                                CFE indicated in column E and that have been
                                screened with the drug indicated in columns
                                F and G, therefore have been included in the
                                test;
    FEATURE_pos_logIC50_MEAN    Average log IC50 of the population of cell
                                lines accounted in colum i;
    FEATURE_neg_logIC50_MEAN    Average log IC50 of the population of cell
                                lines accounted in colum j;
    FEATURE_delta_MEAN_IC50     Difference between the two average natural
                                log IC50 values in the previous two columns
                                (j - i). A negative value indicates an
                                interaction for sensitivity, whereas a
                                positive value indicates an interaction
                                for resistance;
    FEATURE_pos_IC50_sd         Log IC50 Standard deviation for the
                                population of cell lines accounted in column
                                i;
    FEATURE_neg_IC50_sd         Log IC50 Standard deviation for the
                                population of cell lines accounted in column
                                j;
    FEATURE_IC50_effect_size    Cohen's d, quantifying the effect size of
                                the interaction. A value >=0.5 indicates
                                a moderate effect size. A value >=1
                                indicates a large effect size (i.e. difference
                                in mean log IC50 values greater than their
                                pooled standard deviations). A value >= 2
                                indicates a very large effect size (i.e.
                                difference in mean log IC50 is at least two
                                times their pooled standard deviation);
    FEATURE_pos_Glass_delta     Glass delta, quantifying the effect size of
                                the interaction as the ratio between the
                                difference of the mean log IC50 values and
                                the standard deviation of the log IC50 values
                                of the population of cell lines accounted in
                                column i;
    FEATURE_neg_Glass_delta     Glass delta Same as above for the negative set.
    ANOVA_FEATURE_pval          ANOVA test p-value quantyfing the interaction
                                significance;
    ANOVA_TISSUE_pval           ANOVA test p-value quantifying the
                                significance of the interaction between drug
                                response and the tissue of origin of the
                                cell lines; for the cancer-specific
                                interactions this value is NA;
    ANOVA_MEDIA_pval            ANOVA test p-value quantifying the
                                significance of the interaction between drug
                                response and the screening medium of the cell
                                lines; for the cancer-specific interactions
                                this value is NA;
    ANOVA_MSI_pval              ANOVA test p-value quantifying the
                                significance of the interaction between drug
                                response and the micro-satellite instability
                                status of the cell lines; for the cancer type
                                with no micro-satellite instable cell line
                                samples this value is NA;
    ANOVA_FEATURE_FDR           False discovery rate obtained by correcting
                                the p-values in column u, on an individual
                                analysis basis, for multiple hypothesis
                                testing with the q-value correction method
                                (Storey & TIbshirani, 2003)
    =========================== ===============================================

    Note that those column names are renamed internally (and if the data is
    saved in a new file).


    """
    _colname_drug_id = 'DRUG_ID'

    def __init__(self, filename=None):
        """.. rubric:: Constructor

        :param str filename: Another ANOVAResults instance of a saved
            dataframe that can be read by this class, that is a CSV
            with the official header. This parameter can also be set
            to None (default) and populated later.

        """
        if filename is not None and isinstance(filename, str):
            self.read_csv(filename)
        elif filename is None:
            self._df = pd.DataFrame()
        else:
            try:
                self._df = filename.df.copy()
            except:
                self._df = filename.copy()
            assert isinstance(self._df, pd.core.frame.DataFrame), \
                "excepts a dataframe or filename"

        #: dictionary with the relevant column names and their expected types
        self.mapping = OrderedDict()
        self.mapping['ASSOC_ID'] =  np.dtype('int64')
        self.mapping['FEATURE'] = np.dtype('O')
        self.mapping['DRUG_ID'] = np.dtype('O')
        self.mapping['DRUG_NAME'] = np.dtype('O')
        self.mapping['DRUG_TARGET'] = np.dtype('O')
        self.mapping['N_FEATURE_neg'] = np.dtype('int64')
        self.mapping['N_FEATURE_pos'] = np.dtype('int64')
        self.mapping['FEATURE_pos_logIC50_MEAN'] = np.dtype('float64')
        self.mapping['FEATURE_neg_logIC50_MEAN'] = np.dtype('float64')
        self.mapping['FEATURE_delta_MEAN_IC50'] = np.dtype('float64')
        self.mapping['FEATURE_IC50_effect_size'] = np.dtype('float64')
        self.mapping['FEATURE_neg_Glass_delta'] = np.dtype('float64')
        self.mapping['FEATURE_pos_Glass_delta'] = np.dtype('float64')
        self.mapping['FEATURE_neg_IC50_sd'] = np.dtype('float64')
        self.mapping['FEATURE_pos_IC50_sd'] = np.dtype('float64')
        self.mapping['FEATURE_IC50_T_pval'] = np.dtype('float64')
       
        self.mapping['ANOVA_FEATURE_pval'] = np.dtype('float64')
        self.mapping['ANOVA_TISSUE_pval'] = np.dtype('float64')
        self.mapping['ANOVA_MSI_pval'] = np.dtype('float64')
        self.mapping['ANOVA_MEDIA_pval'] = np.dtype('float64')
  
        self.mapping['ANOVA_FEATURE_FDR'] = np.dtype('float64')

        # before gdsctools, columns names were a bit different.
        # We need to rename some column names
        self.df.rename(columns={
            'assoc_id': 'ASSOC_ID',
            'Drug id': 'DRUG_ID',
            'Owned_by': 'OWNED_BY',
            'FEATUREpos_IC50_sd': 'FEATURE_pos_IC50_sd',
            'FEATUREneg_IC50_sd': 'FEATURE_neg_IC50_sd',
            'FEATUREpos_Glass_delta': 'FEATURE_pos_Glass_delta',
            'FEATUREneg_Glass_delta': 'FEATURE_neg_Glass_delta',
            'FEATUREpos_logIC50_MEAN': 'FEATURE_pos_logIC50_MEAN',
            'FEATUREneg_logIC50_MEAN': 'FEATURE_neg_logIC50_MEAN',
            'Drug Target': 'DRUG_TARGET',
            'FEATURE_deltaMEAN_IC50': 'FEATURE_delta_MEAN_IC50',
            'FEATURE_ANOVA_pval': 'ANOVA_FEATURE_pval',
            'ANOVA FEATURE FDR %': 'ANOVA_FEATURE_FDR',
            'MSI_ANOVA_pval': 'ANOVA_MSI_pval',
            'Tissue_ANOVA_pval': 'TISSUE_ANOVA_pval',
            'Drug name': 'DRUG_NAME', 'A':'B'}, inplace=True)

        self.colnames_subset = [
            'ASSOC_ID', 'FEATURE',
            'DRUG_ID', 'DRUG_NAME', 'DRUG_TARGET',
            'N_FEATURE_neg', 'N_FEATURE_pos',
            'FEATURE_pos_logIC50_MEAN', 'FEATURE_neg_logIC50_MEAN',
            'FEATURE_delta_MEAN_IC50',
            'FEATURE_IC50_effect_size',
            'FEATURE_neg_Glass_delta', 'FEATURE_pos_Glass_delta',
            'ANOVA_FEATURE_pval',
            'ANOVA_TISSUE_pval',
            'ANOVA_MSI_pval',
            'ANOVA_MEDIA_pval',
            'ANOVA_FEATURE_FDR']

        self._df.reset_index(drop=True)

    def astype(self, df):
        try:
            # does not work in python3.3 on travis but should work
            # we newer pandas version.
            df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
        except:
            for col in df.columns:
                if col in self.mapping.keys():
                    df[col] = df[col].astype(self.mapping[col])
        return df

    def _get_df(self):
        return self._df
    def _set_df(self, df):
        # TODO check that all columns are found and with correct type.
        self._df = df
    df = property(_get_df, _set_df, doc="dataframe with all results")

    def to_csv(self, filename):
        """Save dataframe into a file using comma separated values"""
        assert filename.endswith('.csv'), "filename should end in .csv"
        self.df.to_csv(filename, sep=',', index=False)

    def read_csv(self, filename):
        """Read a CSV file

        .. todo:: check validity of the header
        """
        self.reader = readers.Reader(filename)
        self._df = self.reader.df

    def __len__(self):
        return len(self.df)

    def _get_drugIds(self):
        if len(self) == 0:
            return []
        else:
            return self.df[self._colname_drug_id].unique()
    drugIds = property(_get_drugIds,
            doc="Returns the list of drug identifiers")

    def volcano(self):
        """Calls :class:`VolcanoANOVA` on the results

        x-value is sign(FEATURE_delta_MEAN_IC50) times FEATURE_IC50_effect_size
        y-value is the FDR correction

        """
        self.handle_volcano = VolcanoANOVA(self.df, settings=self.settings)
        self.handle_volcano.volcano_plot_all()

    def __str__(self):
        txt = 'Total number of ANOVA tests performed: %s ' % len(self.df) 
        return txt 
    
    def __repr__(self):
        txt = 'ANOVAResults (%s tests): ' % len(self.df) 
        return txt 

    def copy(self):
        a = ANOVAResults(self.df.copy())
        return 

    def get_html_table(self, collapse_table=False, clip_threshold=2,
            index=False, header=True, escape=False):
        cmap_clip = cmap_builder('#ffffff', '#0070FF')
        cmap_absmax = cmap_builder('green', 'white', 'red')

        columns = ANOVAResults().colnames_subset

        # The copy is used because we'll change it afterwards
        df = self.df[self.colnames_subset].copy()

        colname = 'ANOVA_FEATURE_FDR'

        df.loc[self.df[colname] < 0.01, colname] = '<0.01'

        html = HTMLTable(self.df, 'notused')
        # Those columns should be links
        for this in ['FEATURE', 'DRUG_ID', 'ASSOC_ID']:
            html.add_href(this)

        for this in ['FEATURE_IC50_effect_size', 'FEATURE_neg_Glass_delta',
                'FEATURE_pos_Glass_delta']:
            html.add_bgcolor(this, cmap_clip, mode='clip',
                    threshold=clip_threshold)

        # normalise data and annotate with color
        html.add_bgcolor('FEATURE_delta_MEAN_IC50', cmap_absmax,
            mode='absmax')

        html.df.columns = [x.replace("_", " ") for x in html.df.columns]
        return html.to_html(escape=escape, header=header, index=index,
                collapse_table=collapse_table, justify='center')

    def barplot_effect_size(self):

        # barplot of the IC50 effect size
        data = np.sign(self.df.FEATURE_delta_MEAN_IC50) * self.df.FEATURE_IC50_effect_size
        data = data.sort_values()

        n_green = len(data[data<0])
        n_red = len(data[data>=0])

        print(n_green, n_red)
        data.plot(kind='barh', width=1, alpha=0.5, 
            color=['green']*n_green + ['red'] * n_red)

        pylab.xlabel("Effect size")
        pylab.ylabel("Drug name")
コード例 #8
0
    def _create_report(self, onweb=True):
        # A summary table
        diag = self.report.diagnostics()
        table = HTMLTable(diag, 'summary')
        txt = ''
        for index, row in diag.iterrows():
            if len(row.text) == 0 and len(row.value) == 0:
                txt += '----<br/>'
            else:
                txt += row.text + ": " + str(row.value) + "<br/>"
        self.jinja['summary'] = txt

        print('Creating volcano plots')
        # this can be pretty slow. so keep only 1000 most relevant
        # values and 1000 random ones to get an idea of the distribution
        v = VolcanoANOVA(self.report.df, settings=self.settings)
        v.selector(v.df, 1500, 1500, inplace=True)
        v.volcano_plot_all()
        v.savefig_and_js("volcano_all_js")

        self.jinja['volcano'] = """
            <h3></h3>
            <a href="volcano_all_js.html">
                <img alt="volcano plot for all associations"
                    src="volcano_all_js.png">
            </a>
            <br/>
            <p> A javascript version is available
                <a href="volcano_all_js.html">here</a> (
                or click on the image).</p>
        """

        # MANOVA link
        N = len(self.report.get_significant_set())
        self.jinja['manova'] = """
        There were %(N)s significant associations found.
        All significant associations have been gatherered
        in the following link: <br/><a href="manova.html">manova results</a>.
        """ % {
            'N': N
        }

        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        # drug summary
        #not_tested = [x for x in self.report.gdsc.drugIds if x not in
        #        self.report.df.DRUG_ID.unique()]
        #if len(not_tested) > 0:
        #    not_tested = """%s drugs were not analysed due to
        #    lack of valid data points: """ % len(not_tested) + \
        #            ", ".join(not_tested)
        #else:
        #    not_tested = ""
        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
        df = df.reset_index()  # get back the Drug id in the dframe columns

        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID')
        table.df.columns = [
            x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
            for x in table.df.columns
        ]
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                                                 header=True,
                                                 index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE')
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                                                    header=True,
                                                    index=False)

        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()

        # -------------------------------------- settings and INPUT files
        input_dir = self.directory + os.sep + 'INPUT'
        filename = 'ANOVA_input.csv'
        filename = os.sep.join([input_dir, filename])
        self.report.gdsc.ic50.to_csv(filename)
        filename = os.sep.join(['INPUT', 'ANOVA_input.csv'])
        self.jinja['ic50_file'] = filename

        # the genomic features, which may be the default version
        # one provided by the user. It may have been changed
        gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
        self.report.gdsc.features.to_csv(gf_filename)
        html = """Saved <a href="INPUT/genomic_features.csv">Genomic
                  Features</a> file<br/> (possibly the default
                  version)."""
        self.jinja['gf_file'] = html

        # Always save DRUG_DECODE file even if empty
        # It may be be interpreted in other pipeline or for reproducibility
        output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
        self.report.drug_decode.to_csv(output_filename)
        html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
        if len(self.report.drug_decode) == 0:
            html += 'Note that DRUG_DECODE file was not provided (empty?).'
        self.jinja['drug_decode'] = html

        # Save settings as json file
        filename = os.sep.join([input_dir, 'settings.json'])
        self.settings.to_json(filename)
        filename = os.path.basename(filename)
        self.jinja['settings'] = \
                """Get the settings as a <a href="INPUT/%s">
                json file</a>.""" % filename

        # Save all Results dataframe
        filename = os.sep.join(
            [self.settings.directory, 'OUTPUT', 'results.csv'])
        ANOVAResults(self.report.df).to_csv(filename)

        code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
        code = code % {
            'ic50': 'ANOVA_input.csv',
            'gf_filename': 'genomic_features.csv'
        }

        filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
        fh = open(filename, 'w')
        fh.write(code)
        fh.close()