Example #1
0
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()


    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is 
    below 0, otherwise it is **resistant**.


    """
    def __init__(self,
                 gdsc,
                 results=None,
                 sep="\t",
                 drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())

    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)

    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)

    n_celllines = property(_get_ncelllines, doc="return number of cell lines")

    def _df_append(self, df, data):
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)"""
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)

        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / (N) * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        msg = "Type of analysis"
        df = self._df_append(df, [msg, self.settings.analysis_type])

        msg = "Total number of possible drug/feature associations"
        df = self._df_append(df, [msg, int(N)])
        msg = "Total number of ANOVA tests performed"
        df = self._df_append(df, [msg, self.n_tests])
        msg = "Percentage of tests performed"
        df = self._df_append(df, [msg, ratio])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])

        msg = "Total number of tested drugs"
        df = self._df_append(df, [msg, n_drugs])
        msg = "Total number of genomic features used"
        df = self._df_append(df, [msg, n_features])

        msg = "Total number of screened cell lines"
        df = self._df_append(df, [msg, self.n_celllines])

        msg = "MicroSatellite instability included as factor"
        msi = self.settings.include_MSI_factor
        df = self._df_append(df, [msg, msi])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        msg = "Total number of significant associations"
        df = self._df_append(df, [msg, nsens + nres])
        msg = " - sensitive"
        df = self._df_append(df, [msg, nsens])
        msg = " - resistant"
        df = self._df_append(df, [msg, nres])

        msg = "p-value significance threshold"
        df = self._df_append(df, [msg, self.settings.pvalue_threshold])

        msg = "FDR significance threshold"
        df = self._df_append(df, [msg, self.settings.FDR_threshold])

        p1, p2 = self._get_pval_range()
        msg = 'Range of significant p-values'
        value = "[{:.4}, {:.4}]".format(p1, p2)
        df = self._df_append(df, [msg, value])

        f1, f2 = self._get_fdr_range()
        msg = "Range of significant % FDRs"
        value = '[{:.4} {:.4}]'.format(f1, f2)
        df = self._df_append(df, [msg, value])
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits"""
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        N = nsens + nres
        if N == 0:
            return 0., 0.
        name = self.varname_pval
        data = self.df[name].iloc[0:N]
        m, M = data.min(), data.max()
        return m, M

    def _get_fdr_range(self):
        """Get FDR range of the significant hits"""
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        m, M = data.min(), data.max()
        return m, M

    def _set_sensible_df(self):
        # just an alias
        logand = np.logical_and

        # select sensible data set
        mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0
        self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)]

        # select resistant data set
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except:
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        # we can drop all columns except one, which is renamed as count
        df1 = df_count_sensible['ASSOC_ID']
        df1.name = 'sens assoc'
        df2 = df_count_resistant['ASSOC_ID']
        df2.name = 'res assoc'

        # Now, we join the two TimeSeries (note that above, we selected only
        # one column so the dataframe was downcast to time series)
        df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and is equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True],
                                 inplace=True)
        except:
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True],
                          inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """

        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename,
                                  size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            #fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename,
                                  set_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the
        bar plot"""
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)
        # add drug name
        if len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                if title_tag == 'drug':
                    name = self.drug_decode.get_name(label)
                    if name is not None:
                        labels[i] = "{}-{}".format(labels[i], name)
                else:
                    pass

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]
        ind = range(0, len(labels))
        # reverse does not exist with python3
        try:
            ind.reverse()
        except:
            ind = list(ind)
            ind.reverse()
        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        p1 = pylab.barh(ind,
                        data1,
                        height=0.8,
                        color='purple',
                        label='sensitivity')
        p2 = pylab.barh(ind,
                        data2,
                        height=0.8,
                        color='orange',
                        left=data1,
                        label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \
                    "\nassociated with drug  response",
                    fontsize=self.settings.fontsize/1.2)
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) ' %
                     ("$>$", self.settings.FDR_threshold, "$\%$"),
                     fontsize=16)

        M = max(data1 + data2)
        #ax.set_xticks()
        #ax.set_xticklabels(labels, fontsize=fontsize)
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except:
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1
        mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2
        mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1
        mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2
        maskMC = mask2 + mask3 + mask4 + mask5

        for fdr in fdrs:
            # significant hits
            res = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(res.sum())

            # meaningful hits
            indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr,
                                     maskMC)
            significant_meaningful.append(indices.sum())

            # meaningful strong hits
            mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1
            strong_hits.append(np.logical_or(mask1, mask2).sum())

            # meaningful full strong hits
            mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1
            full_strong_hits.append(np.logical_and(mask1, mask2).sum())

        data = {
            'significants': significants,
            'full_strong_hits': full_strong_hits,
            'strong_hits': strong_hits,
            'significant_meaningful': significant_meaningful
        }

        df = pd.DataFrame(data,
                          columns=[
                              'significants', 'significant_meaningful',
                              'strong_hits', 'full_strong_hits'
                          ],
                          index=fdrs)
        df.columns = [
            '1) significant', '2) 1 + meaningful', '3) 2 + strong',
            '4) 2+ very strong'
        ]

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar',
                    width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0,
                    ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue    ','aquamarine'),
        return df

    def __str__(self):
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        if self.verbose:
            print(
                "Creating individual HTML pages for each significant association"
            )
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self,
                           drug='dummy',
                           feature='dummy',
                           fdr='dummy',
                           company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            if str(assocs[i]).startswith("a"):
                html._filename = str(assocs[i]) + '.html'
            else:
                html._filename = "a" + str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            #html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        N = len(groups.indices.keys())
        pb = Progress(N)
        for i, feature in enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(groups.indices.keys())
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)"""
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        html = HTMLPageMain(self, 'index.html')
        html._init_report()  # created the directory
        html.create_report(onweb=onweb)
        self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages"""
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')
Example #2
0
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()


    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold
    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is 
    below 0, otherwise it is **resistant**.


    """
    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None, 
            verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df # this does a copy and sanity check


        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)


        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf:0, -np.inf:0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
            doc="return number of cell lines")

    def _df_append(self, df, data):
        count = len(df)
        df.ix[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)"""
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)

        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests)/(N) * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        msg = "Type of analysis"
        df = self._df_append(df, [msg, self.settings.analysis_type])

        msg = "Total number of possible drug/feature associations"
        df = self._df_append(df, [msg, int(N)])
        msg = "Total number of ANOVA tests performed"
        df = self._df_append(df, [msg, self.n_tests])
        msg = "Percentage of tests performed"
        df = self._df_append(df, [msg, ratio])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])

        msg = "Total number of tested drugs"
        df = self._df_append(df, [msg, n_drugs])
        msg = "Total number of genomic features used"
        df = self._df_append(df, [msg, n_features])

        msg = "Total number of screened cell lines"
        df = self._df_append(df, [msg, self.n_celllines])

        msg = "MicroSatellite instability included as factor"
        msi = self.settings.include_MSI_factor
        df = self._df_append(df, [msg, msi])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        msg = "Total number of significant associations"
        df = self._df_append(df, [msg, nsens+nres])
        msg = " - sensitive"
        df = self._df_append(df, [msg, nsens])
        msg = " - resistant"
        df = self._df_append(df, [msg, nres])

        msg = "p-value significance threshold"
        df = self._df_append(df, [msg, self.settings.pvalue_threshold])

        msg = "FDR significance threshold"
        df = self._df_append(df, [msg, self.settings.FDR_threshold])

        p1, p2 = self._get_pval_range()
        msg = 'Range of significant p-values'
        value = "[{:.4}, {:.4}]".format(p1, p2)
        df = self._df_append(df, [msg, value])

        f1, f2 = self._get_fdr_range()
        msg = "Range of significant % FDRs"
        value = '[{:.4} {:.4}]'.format(f1, f2)
        df = self._df_append(df, [msg, value])
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits"""
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        N = nsens + nres
        if N == 0:
            return 0., 0.
        name = self.varname_pval
        data = self.df[name].ix[0:N-1]
        m, M = data.min(), data.max()
        return m, M

    def _get_fdr_range(self):
        """Get FDR range of the significant hits"""
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        m, M = data.min(), data.max()
        return m, M

    def _set_sensible_df(self):
        # just an alias
        logand = np.logical_and

        # select sensible data set
        mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0
        self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)]

        # select resistant data set
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except:
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        # we can drop all columns except one, which is renamed as count
        df1 = df_count_sensible['ASSOC_ID']
        df1.name = 'sens assoc'
        df2 = df_count_resistant['ASSOC_ID']
        df2.name = 'res assoc'

        # Now, we join the two TimeSeries (note that above, we selected only
        # one column so the dataframe was downcast to time series)
        df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and is equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'], 
                    ascending=[False, True], inplace=True)
        except:
            df_count.sort(columns=['total', 'name'], 
                    ascending=[False, True], inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def drug_summary(self,  top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """

        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                    bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            #fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, set_inches=(12, 14),
                    bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the
        bar plot"""
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)
        # add drug name
        if len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                if title_tag == 'drug':
                    name = self.drug_decode.get_name(label)
                    if name is not None:
                        labels[i] = "{}-{}".format(labels[i], name)
                else:
                    pass

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]
        ind = range(0, len(labels))
        # reverse does not exist with python3
        try:
            ind.reverse()
        except:
            ind = list(ind)
            ind.reverse()
        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        p1 = pylab.barh(ind, data1, height=0.8, color='purple',
                        label='sensitivity')
        p2 = pylab.barh(ind, data2, height=0.8, color='orange',
                        left=data1, label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        xticks = ax.get_xticks()
        ax.set_xticklabels(
                [int(x) if divmod(x,1)[1] == 0 else "" for x in xticks])


        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \
                    "\nassociated with drug  response",
                    fontsize=self.settings.fontsize/1.2)
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$>$", self.settings.FDR_threshold, "$\%$"),
                     fontsize=18)

        M = max(data1+data2)
        #ax.set_xticks()
        #ax.set_xticklabels(labels, fontsize=fontsize)
        ax.set_xlim([0, M+1])
        pylab.legend(loc='lower right')
        try:pylab.tight_layout()
        except:pass

    def get_significant_hits(self,  show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits

        .. todo:: to finalise
        """
        fdrs = range(5, 50+1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1
        mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2
        mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1
        mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2
        maskMC = mask2 + mask3 + mask4 + mask5

        for fdr in fdrs:
            # significant hits
            res = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(res.sum())

            # meaningful hits
            indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr,
                    maskMC)
            significant_meaningful.append(indices.sum())

            # meaningful strong hits
            mask1 = self.df.ix[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.ix[indices]['FEATURE_neg_Glass_delta'] >= 1
            strong_hits.append(np.logical_or(mask1, mask2).sum())

            # meaningful full strong hits
            mask1 = self.df.ix[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.ix[indices]['FEATURE_neg_Glass_delta'] >= 1
            full_strong_hits.append(np.logical_and(mask1, mask2).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
               'strong_hits': strong_hits,
               'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data, columns = ['significants',
            'significant_meaningful', 'strong_hits', 'full_strong_hits'],
            index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
        '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue    ','aquamarine'),
        return df

    def __str__(self):
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self, drug='dummy', feature='dummy', fdr='dummy',
                company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            if str(assocs[i]).startswith("a"):
                html._filename = str(assocs[i]) + '.html'
            else:
                html._filename = "a" + str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            #html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        N = len(groups.indices.keys())
        pb = Progress(N)
        for i, feature in enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(groups.indices.keys())
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)"""
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        html = HTMLPageMain(self, 'index.html')
        html._init_report() # created the directory
        html.create_report(onweb=onweb)
        self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages"""
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')