Exemple #1
0
    def __init__(self,
                 gdsc,
                 results=None,
                 sep="\t",
                 drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When provided, it takes precedence over ``gdsc.drug_decode``.
        :param verbose: stored in the :attr:`verbose` attribute

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG identifiers are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # start from default settings and overwrite them with those of gdsc
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # Maybe there was no drug_decode in the gdsc parameter, so a user
        # may have provided one explicitly, in which case it is the one
        # used to populate the drug decoder.
        if drug_decode is not None:
            # user-provided input
            self.drug_decode = DrugDecode(drug_decode)
        elif len(gdsc.drug_decode) == 0:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This is a best-effort step: a
        # failure is reported as a warning instead of being silently ignored.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn("Could not replace infinite values in the "
                          "results: %s" % err)

        # create some data
        self._set_sensible_df()

        self.company = None
    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`. If not provided,
            default settings are created.

        Expected column names::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom area of the plot with little information).

        """
        # work on a copy since the data may be changed later on
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except AttributeError:
            # no df attribute: probably a dataframe
            self.df = data.copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(self._colname_feature).groups
    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When provided, it takes precedence over ``gdsc.drug_decode``.
        :param verbose: stored in the :attr:`verbose` attribute

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG identifiers are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # start from default settings and overwrite them with those of gdsc
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # Maybe there was no drug_decode in the gdsc parameter, so a user
        # may have provided one explicitly, in which case it is the one
        # used to populate the drug decoder.
        if drug_decode is not None:
            # user-provided input
            self.drug_decode = DrugDecode(drug_decode)
        elif len(gdsc.drug_decode) == 0:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This is a best-effort step: a
        # failure is reported as a warning instead of being silently ignored.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn("Could not replace infinite values in the "
                          "results: %s" % err)

        # create some data
        self._set_sensible_df()

        self.company = None
    def __init__(self, gdsc, results, sep="\t", drug_decode=None):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When provided, it takes precedence over ``gdsc.drug_decode``.

        """
        self.figtools = Savefig()
        self.gdsc = gdsc
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # start from default settings and overwrite them with those of gdsc
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # Maybe there was no drug_decode in the gdsc parameter, so a user
        # may have provided one explicitly, in which case it is the one
        # used to populate the drug decoder.
        if drug_decode is not None:
            # user-provided input
            self.drug_decode = DrugDecode(drug_decode)
        elif len(gdsc.drug_decode) == 0:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # create some data
        self._set_sensible_df()

        # just to create the directory
        ReportMAIN(directory=self.settings.directory)
Exemple #5
0
    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`. If not provided,
            default settings are created.

        Expected column names::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom area of the plot with little information).

        """
        # work on a copy since the data may be changed later on
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except AttributeError:
            # no df attribute: probably a dataframe
            self.df = data.copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(self._colname_feature).groups
    def __init__(self, gdsc, results, sep="\t", drug_decode=None):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When provided, it takes precedence over ``gdsc.drug_decode``.

        """
        self.figtools = Savefig()
        self.gdsc = gdsc
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # start from default settings and overwrite them with those of gdsc
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # Maybe there was no drug_decode in the gdsc parameter, so a user
        # may have provided one explicitly, in which case it is the one
        # used to populate the drug decoder.
        if drug_decode is not None:
            # user-provided input
            self.drug_decode = DrugDecode(drug_decode)
        elif len(gdsc.drug_decode) == 0:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # create some data
        self._set_sensible_df()

        # just to create the directory
        ReportMAIN(directory=self.settings.directory)
Exemple #7
0
class VolcanoANOVA(object):
    """Utilities related to volcano plots

    This class is used in :mod:`gdsctools.anova` but can also
    be used independently as in the example below.

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import ANOVA, ic50_test, VolcanoANOVA
        an = ANOVA(ic50_test)

        # restrict analysis to a tissue to speed up computation
        an.set_cancer_type('lung_NSCLC')

        # Perform the entire analysis
        results = an.anova_all()

        # Plot volcano plot of pvalues versus signed effect size
        v = VolcanoANOVA(results)
        v.volcano_plot_all()

    .. note:: Within an IPython shell, you should be able to click
        on a circle and the title will be updated
        with the name of the drug/feature and FDR value.

    :Legend and color conventions: The green circles indicate significant hits
        that are resistant while reds show sensitive hits. Circles are colored
        if they are below the FDR_threshold AND below the pvalue_threshold AND
        if the signed effect size is above the effect_threshold.



    """
    _colname_pvalue = 'ANOVA_FEATURE_pval'
    _colname_qvalue = 'ANOVA_FEATURE_FDR'
    _colname_drugid = 'DRUG_ID'
    _colname_feature = 'FEATURE'
    _colname_deltas = 'FEATURE_delta_MEAN_IC50'
    _colname_effect_size = 'FEATURE_IC50_effect_size'
    _colname_N_feature_pos = 'N_FEATURE_pos'

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param sep: not used by this constructor; kept for backward
            compatibility
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`. If not provided,
            default settings are created.

        Expected column names::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom area of the plot with little information).

        """
        # work on a copy since the data may be changed later on
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except AttributeError:
            # no df attribute: probably a dataframe
            self.df = data.copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(self._colname_feature).groups

    def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
        """Select only the first N best rows and N random ones

        Sometimes, there are tens of thousands of associations and future
        analysis will include more features and drugs. Plotting volcano plots
        should therefore be fast and scalable. Here, we provide a naive
        way of speeding up the plotting by selecting only a subset of the data
        made of Nbest+Nrandom associations.

        :param df: the input dataframe with ANOVAResults
        :param int Nbest: how many of the most significant association
            should be kept
        :param int Nrandom: on top of the Nbest significant association,
            set how many other randomly chosen associations are to be kept.
        :return: pruned dataframe

        """
        if len(df) < Nbest:
            return df
        Nmax = Nbest + Nrandom
        N = len(df)
        if N > Nbest:
            x = range(Nbest, N)
            pylab.shuffle(x)
            n2pick = min(N, Nmax) - Nbest
            indices = range(0, Nbest) + x[0:n2pick]
        else:
            indices = range(0, Nbest)
        # indices in the index may not be order
        indices = [df.index[xx] for xx in indices]
        df = df.ix[indices]
        if inplace is True:
            self.df = df
        else:
            return df

    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save it in a file

        For each unique drug identifier found in the results, the plot is
        created with :meth:`volcano_plot_one_drug` and saved through
        :meth:`savefig`, which is given the name
        **volcano_<drug identifier>.png**. A progress bar is updated after
        each drug.
        """
        column = self.df[self._colname_drugid]
        drug_ids = list(column.unique())
        progress = Progress(len(drug_ids), 1)
        for count, drug_id in enumerate(drug_ids, start=1):
            self.volcano_plot_one_drug(drug_id)
            self.savefig("volcano_%s.png" % drug_id, size_inches=(10, 10))
            progress.animate(count)

    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save it in a file

        For each unique feature found in the results, the plot is created
        with :meth:`volcano_plot_one_feature` and saved through
        :meth:`savefig`, which is given the name
        **volcano_<feature name>.png**. A progress bar is updated after
        each feature.
        """
        column = self.df[self._colname_feature]
        feature_names = list(column.unique())
        print('Creating image for each feature (using all drugs)')
        progress = Progress(len(feature_names), 1)
        for count, name in enumerate(feature_names, start=1):
            self.volcano_plot_one_feature(name)
            self.savefig("volcano_%s.png" % name, size_inches=(10, 10))
            progress.animate(count)

    def volcano_plot_all(self):
        """Create an overall volcano plot for all associations

        This method saves the picture in a PNG file named **volcano_all.png**.
        """
        # no annotations for all features.
        # this is slow, we can drop non relevant data
        data = self._get_volcano_sub_data('ALL')
        data['annotation'] = ['' for x in range(len(data))]

        self._volcano_plot(data, title='all drugs all features')

    def _get_fdr_from_pvalue_interp(self, pvalue):
        """Here, FDR are computed using an interpolation"""
        pvalue += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = qvals[pvals < pvalue].max()
        yb = qvals[pvals > pvalue].min()
        xa = pvals[pvals < pvalue].max()
        xb = pvals[pvals > pvalue].min()
        dx = xb - xa
        dy = yb - ya
        yc = ya + dy * (pvalue - xa) / dx
        return yc

    def _get_pvalue_from_fdr(self, fdr):
        """Get pvalue for a given FDR threshold

        This is equivalent to v17 of the R version but is not very precise
        we should use _get_pvalue_from_fdr_interp instead.

        """
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        if isinstance(fdr, list):
            pvalues = [pvals[qvals < this].max() for this in fdr]
            return pvalues
        else:
            return pvals[qvals < fdr].max()

    def _get_pvalue_from_fdr_interp(self, fdr):
        # same as get_pvalue_from_fdr but with a linear interpolation
        fdr += 1e-15

        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = pvals[qvals <= fdr].max()
        yb = pvals[qvals > fdr].min()
        xa = qvals[qvals <= fdr].max()
        xb = qvals[qvals > fdr].min()

        dx = xb - xa
        dy = yb - ya
        xc = fdr
        yc = ya + dy * (xc - xa) / dx
        return yc

    def _get_volcano_global_data(self):
        # using all data
        colname = self._colname_N_feature_pos
        minN = self.df[colname].min()
        maxN = self.df[colname].max()
        pvalues = self._get_pvalue_from_fdr(self.settings.FDR_threshold)
        return {'minN': minN, 'maxN': maxN,
                'pvalues': (self.settings.FDR_threshold, pvalues)}

    def _get_volcano_sub_data(self, mode, target=None):
        # Return data needed for each plot
        # TODO could be simplified but works for now

        # groups created in the constructor once for all
        if mode == self._colname_drugid:
            subdf = self.df.ix[self.groups_by_drugs[target]]
            texts = subdf[self._colname_feature]
        elif mode == 'FEATURE':
            subdf = self.df.ix[self.groups_by_features[target]]
            texts = subdf[self._colname_drugid]
        elif mode == 'ALL':
            # nothing to do, get all data
            subdf = self.df
            texts = subdf[self._colname_feature] # TODO + drug
        else:
            raise ValueError("mode parameter must be in [FEATURE, %s, ALL]" %
                    (self._colname_drugid))

        # replaced by groups created in the constructor
        #subdf = self.df[self.df[mode] == target]
        deltas = subdf[self._colname_deltas]
        effects = subdf[self._colname_effect_size]
        signed_effects = list(np.sign(deltas) * effects)
        qvals = list(subdf[self._colname_qvalue])
        pvals = list(subdf[self._colname_pvalue])
        #assocs = list(subdf['ASSOC_ID'])

        colors = []
        annotations = []

        data = pd.DataFrame(index=range(len(qvals)))
        data['pvalue'] = pvals
        data['signed_effect'] = signed_effects
        data['Feature'] = list(subdf[self._colname_feature])
        data['Drug'] = list(subdf[self._colname_drugid])
        data['text'] = texts.values
        #data['Assoc'] = assocs
        ## !! here, we need to use .values since the pandas dataframe
        # index goes from 1 to N but the origignal indices in subdf
        # may not be from 1 to N but random between 1 and M>>N
        data['FDR'] = subdf[self._colname_qvalue].values
        annotations = []

        # just an alias
        # FIXME: why do we have a switch here for PANCAN ?
        FDR_threshold = self.settings.FDR_threshold
        if self.settings.analysis_type == 'PANCAN':
            for sign, qval, pval in zip(signed_effects, qvals, pvals):
                if sign <= -self.settings.effect_threshold and \
                        qval <= FDR_threshold and \
                        pval <= self.settings.pvalue_threshold:
                    colors.append('green')
                    annotations.append(True)
                elif sign >= self.settings.effect_threshold and \
                        qval <= FDR_threshold and \
                        pval <= self.settings.pvalue_threshold:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)
        else:
            for delta, qval, pval in zip(deltas, qvals, pvals):
                if pval <= self.settings.pvalue_threshold and \
                        qval <= FDR_threshold and delta < 0: 
                    colors.append('green')
                    annotations.append(True)
                elif pval <= self.settings.pvalue_threshold and \
                        qval <= FDR_threshold and delta > 0:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)

        # here we normalise wrt the drug. In R code, normalised
        # my max across all data (minN, maxN)
        colname = self._colname_N_feature_pos
        markersize = subdf[colname] / subdf[colname].max()
        markersize = list(markersize * 800)
        markersize = [x if x > 80 else 80 for x in markersize]

        data['color'] = colors
        data['annotation'] = annotations
        data['markersize'] = markersize
        return data

    def volcano_plot_one_feature(self, feature):
        """Volcano plot for one feature (all drugs)

        :param feature: a valid feature name to be found in the results
        """
        assert feature in self.features, 'unknown feature name'
        # FEATURE is the mode's name, not a column's name
        data = self._get_volcano_sub_data('FEATURE', feature)
        self._volcano_plot(data, title=feature)

    def volcano_plot_one_drug(self, drug_id):
        """Volcano plot for one drug (all genomic features)

        :param drug_id: a valid drug identifier to be found in the results
        """
        assert drug_id in self.drugs, 'unknown drug name'
        data = self._get_volcano_sub_data(self._colname_drugid, drug_id)
        self._volcano_plot(data, title=drug_id)

    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods
        such as volcano_plot_all

        :param data: a dataframe with the columns ``color``, ``pvalue``,
            ``signed_effect``, ``markersize``, ``Drug``, ``Feature`` and
            ``FDR`` (as built by :meth:`_get_volcano_sub_data`)
        :param title: title of the figure; underscores are replaced by
            two spaces

        The scatter plot (signed effect size versus -log10 of the p-values)
        is drawn in the matplotlib figure number 1, which is cleared first.
        Horizontal lines show the p-values matching the ``FDR_threshold``
        and ``volcano_additional_FDR_lines`` settings. A callback connected
        to the pick events updates the title with the drug, the feature and
        the FDR of the circle clicked by the user.

        The attributes :attr:`cid`, :attr:`data`, :attr:`scatter` and
        :attr:`current_fig` are set as side effects.
        """
        # !! There seems to be a memory leak in this function due to
        # matplotlib. This is not easy to track down and should have no
        # impact now that ANOVAReport uses JS instead of matplotlib

        # TODO signed effects may be inf why ?
        data = data.replace(np.inf, 0)
        data = data.replace(-np.inf, 0)

        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values

        Y = -np.log10(list(pvalues))  # should be cast to list ?

        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        try:
            ax.set_facecolor('#EEEEEE')
        except AttributeError:
            # old matplotlib versions only provide set_axis_bgcolor
            ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()

        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X, Y, s=markersize,
                alpha=0.3, c=colors, linewidth=1, picker=True)
        scatter.set_zorder(11)

        # symmetric x-range with a 10% margin
        m = abs(signed_effects.min())
        M = abs(signed_effects.max())
        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        l = max([m, M]) * 1.1
        pylab.xlim([-l, l])
        ax.grid(color='white', linestyle='solid')

        # range of the FDR found in all the results (not only in data)
        qmin = self.df[self._colname_qvalue].min()
        qmax = self.df[self._colname_qvalue].max()

        # the main threshold cannot be below the smallest FDR
        fdr = self.settings.FDR_threshold
        if fdr < qmin:
            fdr = qmin

        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1]  # reverse sorting

        styles = ['--', ':', '-.']
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
            color='red', alpha=1, label="FDR %s " % fdr + " %")

        for i, this in enumerate(fdrs):
            # additional lines outside the range of the FDR are skipped
            if this < qmin or this > qmax:
                continue
            pvalue = get_pvalue_from_fdr(this)
            # styles are cycled so that any number of lines can be drawn
            ax.axhline(-np.log10(pvalue), linestyle=styles[i % len(styles)],
                color='red', alpha=1, label="FDR %s " % this + " %")

        pylab.ylim([0, pylab.ylim()[1]*1.2])  # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)
        axl = pylab.legend(loc='best')
        axl.set_zorder(10)  # in case there is a circle behind the legend.

        # For the static version
        title_handler = pylab.title("%s" % str(title).replace("_", "  "),
                fontsize=self.settings.fontsize/1.2)

        # This code allows the ipython user to click on the matplotlib figure
        # to get information about the drug and feature of a given circle.
        def onpick(event):
            # event.ind holds positions in the scatter arrays, which follow
            # the row order of data, hence the positional access with iloc
            ind = event.ind[0]
            try:
                row = data.iloc[ind]
                title = str(row['Drug']) + " / " + str(row.Feature)
                title += "\nFDR=" + "%.4e" % row['FDR']
                title_handler.set_text(title.replace("_", "  "))
            except Exception:
                print('Failed to create new title on click')
            print(data.iloc[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        self.data = data
        self.scatter = scatter
        self.current_fig = fig
        # not sure is this is required. could be a memory leak here
        import gc
        gc.collect()

    def savefig(self, filename, size_inches=(10, 10)):
        # Save the PNG first. The savefig automatically set the size
        # to a defined set and back to original figsize.
        self.figtools.savefig(filename + '.png', size_inches=size_inches)
Exemple #8
0
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()


    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is 
    below 0, otherwise it is **resistant**.


    """
    def __init__(self,
                 gdsc,
                 results=None,
                 sep="\t",
                 drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.
        :param sep: not used by this constructor (kept so that existing
            callers passing it keep working)
        :param drug_decode: optional input handed to :class:`DrugDecode`.
            When provided it takes precedence over ``gdsc.drug_decode``.
        :param verbose: verbosity flag stored in :attr:`verbose`

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG identifiers are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # Start from default settings and overwrite them with those of the
        # analysis that produced the results.
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # The gdsc instance may not carry any drug decoder, in which case
        # the user may have provided one explicitly; an explicit one wins
        # over the one attached to the gdsc instance.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            # The two literals are concatenated: the trailing space keeps
            # the sentences apart in the emitted message.
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This stays a best-effort step,
        # but only ordinary errors are ignored (a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit).
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())

    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)

    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)

    n_celllines = property(_get_ncelllines, doc="return number of cell lines")

    def _df_append(self, df, data):
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)"""
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)

        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / (N) * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        msg = "Type of analysis"
        df = self._df_append(df, [msg, self.settings.analysis_type])

        msg = "Total number of possible drug/feature associations"
        df = self._df_append(df, [msg, int(N)])
        msg = "Total number of ANOVA tests performed"
        df = self._df_append(df, [msg, self.n_tests])
        msg = "Percentage of tests performed"
        df = self._df_append(df, [msg, ratio])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])

        msg = "Total number of tested drugs"
        df = self._df_append(df, [msg, n_drugs])
        msg = "Total number of genomic features used"
        df = self._df_append(df, [msg, n_features])

        msg = "Total number of screened cell lines"
        df = self._df_append(df, [msg, self.n_celllines])

        msg = "MicroSatellite instability included as factor"
        msi = self.settings.include_MSI_factor
        df = self._df_append(df, [msg, msi])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        msg = "Total number of significant associations"
        df = self._df_append(df, [msg, nsens + nres])
        msg = " - sensitive"
        df = self._df_append(df, [msg, nsens])
        msg = " - resistant"
        df = self._df_append(df, [msg, nres])

        msg = "p-value significance threshold"
        df = self._df_append(df, [msg, self.settings.pvalue_threshold])

        msg = "FDR significance threshold"
        df = self._df_append(df, [msg, self.settings.FDR_threshold])

        p1, p2 = self._get_pval_range()
        msg = 'Range of significant p-values'
        value = "[{:.4}, {:.4}]".format(p1, p2)
        df = self._df_append(df, [msg, value])

        f1, f2 = self._get_fdr_range()
        msg = "Range of significant % FDRs"
        value = '[{:.4} {:.4}]'.format(f1, f2)
        df = self._df_append(df, [msg, value])
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits"""
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        N = nsens + nres
        if N == 0:
            return 0., 0.
        name = self.varname_pval
        data = self.df[name].iloc[0:N]
        m, M = data.min(), data.max()
        return m, M

    def _get_fdr_range(self):
        """Get FDR range of the significant hits"""
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        m, M = data.min(), data.max()
        return m, M

    def _set_sensible_df(self):
        # just an alias
        logand = np.logical_and

        # select sensible data set
        mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0
        self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)]

        # select resistant data set
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except:
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        # we can drop all columns except one, which is renamed as count
        df1 = df_count_sensible['ASSOC_ID']
        df1.name = 'sens assoc'
        df2 = df_count_resistant['ASSOC_ID']
        df2.name = 'res assoc'

        # Now, we join the two TimeSeries (note that above, we selected only
        # one column so the dataframe was downcast to time series)
        df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and is equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True],
                                 inplace=True)
        except:
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True],
                          inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param top: max number of significant associations to show
        :param fontsize: currently not used by this method
        :param filename: name handed to the figure helper, which saves the
            figure in the directory found in :attr:`settings`
        :return: the dataframe returned by :meth:`get_drug_summary_data`

        Nothing is plotted or saved when there is no significant drug.
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            # The figure is saved where the settings say so.
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename,
                                  size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param filename: name handed to the figure helper, which saves the
            figure in the directory found in :attr:`settings`
        :param top: max number of significant associations to show
        :param fontsize: currently not used by this method
        :return: the dataframe returned by :meth:`get_feature_summary_data`

        Nothing is plotted or saved when there is no significant feature.
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # The keyword is ``size_inches`` (as used by drug_summary);
            # it was previously misspelled ``set_inches``.
            self.figtools.savefig(filename,
                                  size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the
        bar plot

        :param df_count: dataframe with the columns ``sens assoc`` and
            ``res assoc``; only its first *top* rows are shown
        :param title_tag: word inserted in the title; when set to ``drug``
            the labels are completed with the drug names (if a drug decoder
            is available)
        :param top: maximum number of rows to show

        The labels shown on the y-axis are stored in :attr:`labels`.
        """
        top = min(top, len(df_count))

        df = df_count.iloc[0:top][['sens assoc', 'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)

        # underscores are replaced by spaces and labels are restricted to
        # their first 30 characters
        labels = [str(x).replace('_', ' ')[0:30] for x in labels]

        # bar positions in decreasing order so that the first row of the
        # dataframe gets the highest position
        ind = list(range(len(labels) - 1, -1, -1))

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind,
                   data1,
                   height=0.8,
                   color='purple',
                   label='sensitivity')
        # resistance bars are stacked to the right of the sensitivity ones
        pylab.barh(ind,
                   data2,
                   height=0.8,
                   color='orange',
                   left=data1,
                   label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # only integer ticks are labelled on the x-axis
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug  response",
                    fontsize=self.settings.fontsize/1.2)
        # NOTE(review): the label reads "FDR > threshold" whereas the
        # significant associations are selected with FDR < threshold;
        # left unchanged here, to be confirmed.
        # The raw string keeps the backslash of ``\%`` without relying on an
        # invalid escape sequence.
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) ' %
                     ("$>$", self.settings.FDR_threshold, r"$\%$"),
                     fontsize=16)

        M = max(data1 + data2)
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # the layout adjustment is a best-effort cosmetic step
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1
        mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2
        mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1
        mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2
        maskMC = mask2 + mask3 + mask4 + mask5

        for fdr in fdrs:
            # significant hits
            res = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(res.sum())

            # meaningful hits
            indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr,
                                     maskMC)
            significant_meaningful.append(indices.sum())

            # meaningful strong hits
            mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1
            strong_hits.append(np.logical_or(mask1, mask2).sum())

            # meaningful full strong hits
            mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1
            full_strong_hits.append(np.logical_and(mask1, mask2).sum())

        data = {
            'significants': significants,
            'full_strong_hits': full_strong_hits,
            'strong_hits': strong_hits,
            'significant_meaningful': significant_meaningful
        }

        df = pd.DataFrame(data,
                          columns=[
                              'significants', 'significant_meaningful',
                              'strong_hits', 'full_strong_hits'
                          ],
                          index=fdrs)
        df.columns = [
            '1) significant', '2) 1 + meaningful', '3) 2 + strong',
            '4) 2+ very strong'
        ]

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar',
                    width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0,
                    ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue    ','aquamarine'),
        return df

    def __str__(self):
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`; an ``a`` is prepended
        to identifiers that do not already start with that letter.

        """
        if self.verbose:
            print(
                "Creating individual HTML pages for each significant association"
            )
        significant = self.get_significant_set()

        drugs = significant['DRUG_ID'].values
        features = significant['FEATURE'].values
        assocs = significant['ASSOC_ID'].values
        fdrs = significant['ANOVA_FEATURE_FDR'].values

        pb = Progress(len(significant))

        # a single page instance is shared and updated for each association
        html = Association(self,
                           drug='dummy',
                           feature='dummy',
                           fdr='dummy',
                           company=self.company)

        rows = zip(drugs, features, assocs, fdrs)
        for count, (drug, feature, assoc, fdr) in enumerate(rows, start=1):
            html.drug = drug
            html.feature = feature
            stem = str(assoc)
            if not stem.startswith("a"):
                stem = "a" + stem
            html._filename = stem + '.html'
            html.fdr = fdr
            html.assoc_id = assoc
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(count)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature

        The significant associations are grouped by ``FEATURE`` and one
        report is created per group.
        """
        significant = self.get_significant_set()
        groups = significant.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")

        pb = Progress(len(groups.indices.keys()))
        for count, feature in enumerate(groups.indices.keys(), start=1):
            # rows of the significant set related to this feature
            feature_rows = groups.get_group(feature)
            page = HTMLOneFeature(self, self.df, feature_rows, feature)
            page.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(count)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        A page is created for every drug found in :attr:`df`, including
        drugs without any significant association; for those, an empty
        dictionary is passed in place of the dataframe of significant hits.
        """
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        # mapping computed once: drugs that have significant associations
        significant_drugs = groups.groups
        if self.verbose:
            print("Creating individual HTML pages for each drug")

        pb = Progress(len(all_drugs))
        for i, drug in enumerate(all_drugs):
            if drug in significant_drugs:
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)

        :param onweb: forwarded to the ``create_report`` method of the page

        The ``savefig`` setting is forced to True while the page is created
        and restored afterwards, even if the creation fails.
        """
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report()  # created the directory
            html.create_report(onweb=onweb)
        finally:
            # do not leave the user's setting modified on error
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        significant = self.get_significant_set()
        HTMLPageMANOVA(self, significant, self.company).create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages

        :param onweb: forwarded to :meth:`create_html_main` only; the
            other pages are created with ``onweb=False`` or without it.
        """
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        # remaining pages, created in this order
        for create in (self.create_html_drugs,
                       self.create_html_features,
                       self.create_html_associations):
            create()

    def onweb(self):
        """Call ``easydev.onweb`` on the ``index.html`` file of the report
        directory."""
        from easydev import onweb as open_page
        index_page = os.sep.join([self.settings.directory, 'index.html'])
        open_page(index_page)
class VolcanoANOVA(object):
    """Utilities related to volcano plots

    This class is used in :mod:`gdsctools.anova` but can also
    be used independently as in the example below.

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import ANOVA, ic50_test, VolcanoANOVA
        an = ANOVA(ic50_test)

        # restrict analysis to a tissue to speed up computation
        an.set_cancer_type('lung_NSCLC')

        # Perform the entire analysis
        results = an.anova_all()

        # Plot volcano plot of pvalues versus signed effect size
        v = VolcanoANOVA(results)
        v.volcano_plot_all()

    .. note:: Within an IPython shell, you should be able to click
        on a circle and the title will be updated
        with the name of the drug/feature and FDR value.

    .. note:: (**for developers**) A javascript version is also
        created on the fly using mpld3 library. It is used in the
        creation of the HTML report but one can use it as well in an
        ipython notebook::

            # The **v** instance is as created in the example above
            # Then, type the following code to create an HTML with the
            # javascript plot embedded.
            import mpld3
            htmljs = mpld3.fig_to_html(v.current_fig)
            fh = open('volcano_doc.html', 'w')
            fh.write(htmljs)
            fh.close()

    There are 5 methods to plot volcano plots depending on what you want to see

    - :meth:`volcano_plot_all` as above plots all associations
    - :meth:`volcano_plot_all_drugs` creates a volcano plot for each drug and
      save it into a PNG file. This method calls :meth:`volcano_plot_one_drug`.
    - :meth:`volcano_plot_all_features` creates a volcano plot for each feature
      and save it into a PNG file. This method calls
      :meth:`volcano_plot_one_feature`.

    """
    _colname_pvalue = 'ANOVA_FEATURE_pval'
    _colname_qvalue = 'ANOVA_FEATURE_FDR'
    _colname_drugid = 'DRUG_ID'
    _colname_feature = 'FEATURE'
    _colname_deltas = 'FEATURE_delta_MEAN_IC50'
    _colname_effect_size = 'FEATURE_IC50_effect_size'
    _colname_N_feature_pos = 'N_FEATURE_pos'

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param sep: not used by this constructor (kept so that existing
            callers passing it keep working)
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`. Default settings
            are used if not provided.

        Expected column names::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom area of the plot with little information).

        """
        # An ANOVAResults instance exposes its dataframe as ``df``;
        # otherwise the input is taken to be the dataframe itself. A copy is
        # made since the data may be changed.
        self.df = getattr(data, "df", data).copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(self._colname_feature).groups

    def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
        """Select only the first N best rows and N random ones

        Sometimes, there are tens of thousands of associations and future
        analysis will include more features and drugs. Plotting volcano plots
        should therefore be fast and scalable. Here, we provide a naive
        way of speeding up the plotting by selecting only a subset of the data
        made of Nbest+Nrandom associations.

        :param df: the input dataframe with ANOVAResults. The *Nbest* rows
            kept are its first rows, so it is expected to be sorted with the
            most significant associations first.
        :param int Nbest: how many of the most significant association
            should be kept
        :param int Nrandom: on top of the Nbest significant association,
            set how many other randomly chosen associations are to be kept.
        :param bool inplace: if True, the pruned dataframe is stored in
            :attr:`df` and nothing is returned.
        :return: pruned dataframe (or the input itself if it has less than
            *Nbest* rows)

        """
        N = len(df)
        if N < Nbest:
            return df

        # positions of the rows following the Nbest first ones, shuffled so
        # that the extra rows are picked at random
        remaining = list(range(Nbest, N))
        np.random.shuffle(remaining)
        n2pick = min(N, Nbest + Nrandom) - Nbest

        # positional selection: works whatever the labels of the index are
        positions = list(range(Nbest)) + remaining[:n2pick]
        df = df.iloc[positions]
        if inplace is True:
            self.df = df
        else:
            return df

    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig_and_js("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i + 1)

            # This prevent memory leak.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except Exception:
                # mpld3 is optional and this cleanup is best-effort; a bare
                # ``except`` would also swallow Ctrl-C inside this long loop
                pass

    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig_and_js("volcano_%s.png" % feature,
                                size_inches=(10, 10))
            pb.animate(i + 1)

            # This prevent memory leak.
            self.current_fig.canvas.mpl_disconnect(self.cid)
            try:
                import mpld3
                mpld3.plugins.clear(self.current_fig)
            except Exception:
                # mpld3 is optional and this cleanup is best-effort; a bare
                # ``except`` would also swallow Ctrl-C inside this long loop
                pass

    def volcano_plot_all(self):
        """Create an overall volcano plot for all associations

        The data of all associations are retrieved, their annotations are
        blanked and the plot is delegated to :meth:`_volcano_plot`.

        NOTE(review): earlier documentation stated that the picture is
        saved in **volcano_all.png**; no saving is done in this method
        itself, to be confirmed against :meth:`_volcano_plot`.
        """
        # this is slow, we can drop non relevant data
        everything = self._get_volcano_sub_data('ALL')
        # no annotations when all features are shown
        everything['annotation'] = [''] * len(everything)

        self._volcano_plot(everything, title='all drugs all features')

    def _get_fdr_from_pvalue_interp(self, pvalue):
        """Here, FDR are computed using an interpolation"""
        pvalue += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = qvals[pvals < pvalue].max()
        yb = qvals[pvals > pvalue].min()
        xa = pvals[pvals < pvalue].max()
        xb = pvals[pvals > pvalue].min()
        dx = xb - xa
        dy = yb - ya
        yc = ya + dy * (pvalue - xa) / dx
        return yc

    def _get_pvalue_from_fdr(self, fdr):
        """Get pvalue for a given FDR threshold

        This is equivalent to v17 of the R version but is not very precise
        we should use _get_pvalue_from_fdr_interp instead but needs to be
        tested.

        """
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        if isinstance(fdr, list):
            pvalues = [pvals[qvals < this].max() for this in fdr]
            return pvalues
        else:
            return pvals[qvals < fdr].max()

    def _get_pvalue_from_fdr_interp(self, fdr):
        # same as get_pvalue_from_fdr but with a linear inerpolation
        fdr += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = pvals[qvals < fdr].max()
        yb = pvals[qvals > fdr].min()
        xa = qvals[qvals < fdr].max()
        xb = qvals[qvals > fdr].min()
        dx = xb - xa
        dy = yb - ya
        xc = fdr
        yc = ya + dy * (xc - xa) / dx
        return yc

    def _get_volcano_global_data(self):
        # using all data
        colname = self._colname_N_feature_pos
        minN = self.df[colname].min()
        maxN = self.df[colname].max()
        pvalues = self._get_pvalue_from_fdr(self.settings.FDR_threshold)
        return {
            'minN': minN,
            'maxN': maxN,
            'pvalues': (self.settings.FDR_threshold, pvalues)
        }

    def _get_volcano_sub_data(self, mode, target=None):
        """Build the per-point table consumed by :meth:`_volcano_plot`.

        :param mode: selects the rows to use. It is either the name of the
            drug identifier column (``self._colname_drugid``), ``'FEATURE'``
            or ``'ALL'``.
        :param target: key used to look up the rows in
            ``self.groups_by_drugs`` (drug mode) or
            ``self.groups_by_features`` (feature mode). Ignored in ``'ALL'``
            mode.
        :return: dataframe indexed from 0 to N-1 with the columns ``pvalue``,
            ``signed_effect``, ``Feature``, ``Drug``, ``text``, ``FDR``,
            ``color``, ``annotation`` and ``markersize``.
        :raises ValueError: if *mode* is not one of the supported values.
        """
        # Groups are created in the constructor once for all.
        # NOTE(review): the groups are assumed to hold index *labels* of
        # self.df (what DataFrame.groupby(...).groups provides) -- confirm.
        # .loc replaces the .ix indexer that was removed in pandas 1.0.
        if mode == self._colname_drugid:
            subdf = self.df.loc[self.groups_by_drugs[target]]
            texts = subdf[self._colname_feature]
        elif mode == 'FEATURE':
            subdf = self.df.loc[self.groups_by_features[target]]
            texts = subdf[self._colname_drugid]
        elif mode == 'ALL':
            # nothing to do, get all data
            subdf = self.df
            texts = subdf[self._colname_feature]  # TODO + drug
        else:
            raise ValueError("mode parameter must be in [FEATURE, %s, ALL]" %
                             (self._colname_drugid))

        deltas = subdf[self._colname_deltas]
        effects = subdf[self._colname_effect_size]
        # the effect size carries the sign of the delta
        signed_effects = list(np.sign(deltas) * effects)
        qvals = list(subdf[self._colname_qvalue])
        pvals = list(subdf[self._colname_pvalue])

        # The new dataframe is indexed from 0 to N-1 whereas the index of
        # subdf may be any subset of the original index. Hence plain lists
        # or .values are assigned so that no index alignment takes place.
        data = pd.DataFrame(index=range(len(qvals)))
        data['pvalue'] = pvals
        data['signed_effect'] = signed_effects
        data['Feature'] = list(subdf[self._colname_feature])
        data['Drug'] = list(subdf[self._colname_drugid])
        data['text'] = texts.values
        data['FDR'] = subdf[self._colname_qvalue].values

        # just an alias
        FDR_threshold = self.settings.FDR_threshold
        if self.settings.analysis_type == 'PANCAN':
            # PANCAN: significance is based on the signed effect size and
            # the FDR
            effect_threshold = self.settings.effect_threshold
            colors = []
            for effect, qval in zip(signed_effects, qvals):
                if effect <= -effect_threshold and qval <= FDR_threshold:
                    colors.append('green')
                elif effect >= effect_threshold and qval <= FDR_threshold:
                    colors.append('red')
                else:
                    colors.append('black')
        else:
            # otherwise it is based on the p-value, the FDR and the sign
            # of the delta
            pvalue_threshold = self.settings.pvalue_threshold
            colors = []
            for delta, qval, pval in zip(deltas, qvals, pvals):
                significant = pval <= pvalue_threshold and \
                    qval <= FDR_threshold
                if significant and delta < 0:
                    colors.append('green')
                elif significant and delta > 0:
                    colors.append('red')
                else:
                    colors.append('black')
        # only the coloured (significant) points are flagged for annotation
        annotations = [color != 'black' for color in colors]

        # here we normalise wrt the selected subset. In R code, normalised
        # by max across all data (minN, maxN)
        counts = subdf[self._colname_N_feature_pos]
        markersize = list(counts / counts.max() * 800)
        # enforce a minimal marker size
        markersize = [x if x > 80 else 80 for x in markersize]

        data['color'] = colors
        data['annotation'] = annotations
        data['markersize'] = markersize
        return data

    def volcano_plot_one_feature(self, feature):
        """Draw the volcano plot of a single feature (across all drugs)

        :param feature: a valid feature name to be found in the results;
            an AssertionError is raised if it is not in ``self.features``.
        """
        assert feature in self.features, 'unknown feature name'
        # 'FEATURE' designates the selection mode here; it is not used as a
        # column name
        points = self._get_volcano_sub_data('FEATURE', feature)
        self._volcano_plot(points, title=feature)

    def volcano_plot_one_drug(self, drug_id):
        """Draw the volcano plot of a single drug (across all genomic features)

        :param drug_id: a valid drug identifier to be found in the results;
            an AssertionError is raised if it is not in ``self.drugs``.
        """
        assert drug_id in self.drugs, 'unknown drug name'
        # the selection mode for drugs is the drug identifier column name
        points = self._get_volcano_sub_data(self._colname_drugid, drug_id)
        self._volcano_plot(points, title=drug_id)

    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods
        such as volcano_plot_all"""
        # This functio is a bit complicated because it does create a few tricky
        # plots

        # It creates a volcano plot, which is the easy part
        # Then, it creates tooltips for the user interface in an IPython
        # shell using a callback to 'onpick' function coded here below
        # finally, it creates a Javascript connection using mpld3 that
        # will allow the creation of a JS version of the plot.

        # !! There is a memory leak in this function due to matplotlib
        # This is not easy to track down.

        # You have to call clf() to make sure the content is erase.
        # One reason for the memory leak is that it is called in the
        # Report to loop over all drugs and then all featuers.
        # To see the memory leak, you will need to call the
        # volcano_plot_all_drugs function (or volcano_plot_all_features).
        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values

        Y = -np.log10(list(pvalues))  # should be cast to list ?

        num = 1
        #pylab.close(num)
        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()
        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X,
                             Y,
                             s=markersize,
                             alpha=0.3,
                             c=colors,
                             linewidth=1,
                             picker=True)
        scatter.set_zorder(11)

        m = abs(signed_effects.min())
        M = abs(signed_effects.max())
        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        l = max([m, M]) * 1.1
        pylab.xlim([-l, l])
        ax.grid(color='white', linestyle='solid')

        # some aliases
        fdr = self.settings.FDR_threshold
        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1]  # reverse sorting

        styles = ['--', ':', '-.']
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue),
                   linestyle='--',
                   lw=2,
                   color='red',
                   alpha=1,
                   label="FDR %s " % fdr + " \%")

        for i, this in enumerate(fdrs):
            if this < self.df[self._colname_qvalue].min() or\
                this > self.df[self._colname_qvalue].max():
                continue
            pvalue = get_pvalue_from_fdr(this)
            ax.axhline(-np.log10(pvalue),
                       linestyle=styles[i],
                       color='red',
                       alpha=1,
                       label="FDR %s " % this + " \%")

        pylab.ylim([0, pylab.ylim()[1] * 1.2])  # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)
        axl = pylab.legend(loc='best')
        axl.set_zorder(10)  # in case there is a circle behind the legend.

        #self.ax = ax
        #self.axx = ax.twinx()
        #self.common_ticks = ax.get_yticks()
        #self.common_ylim = ax.get_ylim()
        #pvals = self.df[self._colname_pvalue]
        #y1 = pvals.min()
        #y2 = pvals.max()
        #fdr1 = self._get_fdr_from_pvalue_interp(y1)
        #fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
        #self.axx.set_ylim([fdr2, fdr1])
        #self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)

        # For the static version
        title_handler = pylab.title("%s" % title.replace("_", "  "),
                                    fontsize=self.settings.fontsize / 1.2)
        labels = []

        # This code allows the ipython user to click on the matplotlib figure
        # to get information about the drug and feature of a given circles.
        def onpick(event):
            ind = event.ind[0]
            try:
                title = str(data.ix[ind]['Drug']) + " / " + str(
                    data.ix[ind].Feature)
                title += "\nFDR=" + "%.4e" % data.ix[ind]['FDR']
                title_handler.set_text(title.replace("_", "  "))
            except:
                print('Failed to create new title on click')
            print(data.ix[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        # for the JS version
        # TODO: for the first 1 to 2000 entries ?
        labels = []

        self.data = data
        for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():

            template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>""" % row.to_dict()
            labels.append(template)

            # this is more elegant but slower
            #label = row.to_frame()
            #label.columns = ['Row {0}'.format(i)]
            #labels.append(str(label.to_html(header=False)))
        css = """
        svg.mpld3-figure { border: 2px black solid;margin:10px;}
        table{  font-size:0.8em;  }
        th {  color: #ffffff;  background-color: #aaaaaa;  }
        td { color: blue; background-color: #cccccc; }"""

        try:
            import mpld3
            tooltip = mpld3.plugins.PointHTMLTooltip(scatter,
                                                     labels=labels,
                                                     css=css)
            mpld3.plugins.connect(fig, tooltip)
        except:
            print("Issue with javascript version of the volcano plot. Skipped")
        self.scatter = scatter
        self.current_fig = fig
        # not sure is this is required. could be a memory leak here
        import gc
        gc.collect()

    def mpld3_to_html(self):
        """Return the current figure as an HTML/Javascript snippet

        This require to call a plotting figure before hand since
        ``self.current_fig`` is converted.

        :return: string made of a ``<div class="jsimage">`` element that
            wraps the output of mpld3. The element is empty if the
            conversion failed.
        """
        from gdsctools import gdsctools_data
        # This copy the full path and therefore HTML cannot
        # be moved in another directory. to be fixed.
        js_path1 = gdsctools_data('d3.v3.min.js', where='javascript')
        js_path2 = gdsctools_data('mpld3.v0.2.js', where='javascript')
        try:
            # mpld3 is great but there are a couple of issues
            # 1 - legend zorder is not used so dots may be below the legend,
            #     hence the framealpha setting
            # 2 - % character even though there well interpreted in matploltib
            #     using \%, they are not once parsed by mpld3. So, here
            #     we remove the \ character
            axl = pylab.legend(loc='best', framealpha=0.8, borderpad=1)
            axl.set_zorder(10)  # in case there is a circle behind the legend.

            for legend_text in axl.get_texts():
                cleaned = legend_text.get_text().replace("\\%", "%")
                legend_text.set_text(cleaned + "  ")
            import mpld3
            htmljs = mpld3.fig_to_html(self.current_fig,
                                       d3_url=js_path1,
                                       mpld3_url=js_path2)
        except Exception:
            # best effort: an empty snippet is returned, but interrupts and
            # interpreter exit are not swallowed anymore
            htmljs = ""
        return """<div class="jsimage"> """ + htmljs + "</div>"

    def savefig_and_js(self, filename, size_inches=(10, 10)):
        """Save the current figure as a PNG file and as an HTML/JS page

        :param filename: output name without extension. ``filename + '.png'``
            is handed to ``self.figtools.savefig`` and the HTML snippet
            returned by :meth:`mpld3_to_html` is written to
            ``filename + '.html'`` inside ``self.settings.directory``.
        :param size_inches: figure size used for both outputs.

        The size of ``self.current_fig`` is restored afterwards, even when
        the creation of the HTML file fails.
        """
        # Save the PNG first. The figure size is passed on to savefig.
        self.figtools.savefig(filename + '.png', size_inches=size_inches)

        # now the javascript.
        fig = self.current_fig
        oldsize = fig.get_size_inches()
        fig.set_size_inches(size_inches)
        try:
            htmljs = self.mpld3_to_html()
            path = self.settings.directory + os.sep + filename + ".html"
            # the context manager closes the file even if write() fails
            with open(path, "w") as fh:
                fh.write(htmljs)
        finally:
            fig.set_size_inches(*oldsize)
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()


    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* is < FDR_threshold
      - The field *ANOVA_FEATURE_pval* is < pvalue_threshold

    It is then labelled **sensitive** if *FEATURE_delta_MEAN_IC50* is
    below 0, otherwise it is **resistant**.


    """
    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
            verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.
        :param sep: not used by the constructor (kept so that the signature
            is unchanged).
        :param drug_decode: if provided, it is handed to DrugDecode (e.g. a
            DRUG_DECODE file) and used instead of ``gdsc.drug_decode``.
        :param verbose: stored in :attr:`verbose`; used by the methods that
            create the HTML pages to print progress messages.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df # this does a copy and sanity check

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # start from default settings and copy those of the analysis
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the drug decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This is a best-effort step: a
        # failure is ignored on purpose, but interrupts and interpreter
        # exit are not swallowed.
        try:
            self.df = self.df.replace({np.inf:0, -np.inf:0})
        except Exception:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

    def _get_ndrugs(self):
        """Count the distinct values of the drug identifier column of df"""
        drug_ids = self.df[self._colname_drug_id].unique()
        return len(drug_ids)
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        """Return the number of rows of df (one row per ANOVA test)"""
        row_labels = self.df.index
        return len(row_labels)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        """Return the number of rows of the features dataframe of gdsc"""
        features_df = self.gdsc.features.df
        return len(features_df.index)
    n_celllines = property(_get_ncelllines,
            doc="return number of cell lines")

    def _df_append(self, df, data):
        """Append one row to a dataframe (in place) and return it

        :param df: dataframe to extend. The new row is labelled with the
            current number of rows, so the index is expected to go from 0
            to N-1.
        :param data: list with one value per column.
        :return: the same dataframe, with the extra row.
        """
        count = len(df)
        # .loc replaces the .ix indexer that was removed in pandas 1.0
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The significant sets are recomputed first. The returned dataframe
        has two columns: ``text`` (description of the entry) and ``value``.
        Entries with an empty text and value are used as separators.
        """
        self._set_sensible_df()

        # number of features (columns of the features dataframe minus the
        # shift stored along with it) and number of tested drugs
        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())
        N = float(n_drugs * n_features)

        # percentage of the possible associations that were tested
        ratio = 0 if N == 0 else float(self.n_tests)/(N) * 100
        try:
            ratio = easydev.precision(ratio, digit=2)
        except:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            # trick to have an empty line
            ["", ""],
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
                self.settings.include_MSI_factor],
            # trick to have an empty line
            ["", ""],
            ["Total number of significant associations", nsens+nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
                self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
                "[{:.4}, {:.4}]".format(p1, p2)],
            ["Range of significant % FDRs",
                '[{:.4} {:.4}]'.format(f1, f2)],
        ]

        df = pd.DataFrame({'text': [], 'value': []})
        for row in rows:
            df = self._df_append(df, row)
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        :return: ``(min, max)`` of the p-value column (``self.varname_pval``)
            over the rows of :attr:`sensible_df` and :attr:`resistant_df`,
            or ``(0., 0.)`` if there is no significant hit.
        """
        name = self.varname_pval
        # The range is computed on the significant hits themselves rather
        # than on the first N rows of df (N being the number of hits),
        # which was correct only if df was sorted with the hits first and
        # indexed from 0. It also avoids .ix, removed in pandas 1.0.
        pvalues = pd.concat([self.sensible_df[name],
                             self.resistant_df[name]])
        if len(pvalues) == 0:
            return 0., 0.
        return pvalues.min(), pvalues.max()

    def _get_fdr_range(self):
        """Get FDR range of the significant hits

        :return: ``(min, max)`` of the FDR column (``self.varname_qval``)
            restricted to the values strictly below
            ``settings.FDR_threshold``, or ``(0., 0.)`` if there is none.
        """
        fdr = self.df[self.varname_qval]
        below_threshold = fdr[fdr < self.settings.FDR_threshold]
        if below_threshold.empty:
            return 0., 0.
        return below_threshold.min(), below_threshold.max()

    def _set_sensible_df(self):
        """Split the significant associations into two dataframes

        A row is significant when its FDR and its p-value are both strictly
        below the thresholds found in the settings. Significant rows with a
        negative delta are stored in :attr:`sensible_df`; those with a
        delta greater than or equal to zero in :attr:`resistant_df`.
        """
        results = self.df
        significant = (
            (results['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold) &
            (results['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold))
        delta = results['FEATURE_delta_MEAN_IC50']

        # select sensible data set
        self.sensible_df = results[significant & (delta < 0)]
        # select resistant data set
        self.resistant_df = results[significant & (delta >= 0)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)

        The significant sets are recomputed on each call since they depend
        on the thresholds stored in the settings, which may have changed.

        :return: dataframe with the rows of :attr:`sensible_df` followed by
            those of :attr:`resistant_df`, sorted by ``ASSOC_ID``.
        :raises KeyError: if there is no ``ASSOC_ID`` column.
        """
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        # sort_values is missing in old pandas versions only. Checking for
        # it (instead of catching any exception) lets genuine errors, such
        # as a missing column, propagate as they are.
        if hasattr(df, 'sort_values'):
            df.sort_values('ASSOC_ID', inplace=True)
        else:
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge the sensitive and resistant counts into one summary table

        :param df_count_sensible: dataframe of counts; only its ``ASSOC_ID``
            column is used. The input is left untouched.
        :param df_count_resistant: same for the resistant associations.
        :return: dataframe indexed by the union of the two input indices
            with the columns ``sens assoc``, ``res assoc`` and ``total``,
            sorted by decreasing ``total`` and then by increasing index
            value. Entries missing in one of the inputs are counted as 0.
        """
        # we can drop all columns except one, which is renamed. Building
        # new dataframes avoids renaming series owned by the caller.
        sensible = pd.DataFrame(
            {'sens assoc': df_count_sensible['ASSOC_ID']})
        resistant = pd.DataFrame(
            {'res assoc': df_count_resistant['ASSOC_ID']})

        # outer join to keep the entries found in only one of the two sets
        df_count = sensible.join(resistant, how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and if equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        # sort_values is missing in old pandas versions only; other errors
        # are not hidden anymore
        if hasattr(df_count, 'sort_values'):
            df_count.sort_values(by=['total', 'name'],
                    ascending=[False, True], inplace=True)
        else:
            df_count.sort(columns=['total', 'name'],
                    ascending=[False, True], inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary

        The significant sets are recomputed, counted per drug identifier
        and merged by :meth:`_get_data`.
        """
        # refresh sensible and resistant sub dataframes
        self._set_sensible_df()

        # count the rows of each set, drug by drug
        key = self._colname_drug_id
        per_drug = [subset.groupby(key).count()
                    for subset in (self.sensible_df, self.resistant_df)]
        return self._get_data(*per_drug)

    def drug_summary(self,  top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: not used at the moment
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
            (``settings.directory``); nothing is saved otherwise.
        :return: the dataframe returned by :meth:`get_drug_summary_data`.
            The figure is created only if it is not empty.
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            # filename is optional: do not try to save without a name
            if filename is not None:
                self.figtools.savefig(filename, size_inches=(12, 14),
                        bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary

        The significant sets are recomputed, counted per value of the
        ``FEATURE`` column and merged by :meth:`_get_data`.
        """
        # refresh sensible and resistant sub dataframes
        self._set_sensible_df()
        per_feature = [subset.groupby('FEATURE').count()
                       for subset in (self.sensible_df, self.resistant_df)]
        return self._get_data(*per_feature)

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: not used at the moment
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
            (``settings.directory``); nothing is saved otherwise.
        :return: the dataframe returned by
            :meth:`get_feature_summary_data`. The figure is created only if
            it is not empty.
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # filename is optional: do not try to save without a name
            if filename is not None:
                # the keyword is size_inches (as in drug_summary and
                # savefig_and_js), not set_inches
                self.figtools.savefig(filename, size_inches=(12, 14),
                        bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the
        bar plot

        :param df_count: dataframe with the columns ``sens assoc`` and
            ``res assoc``; its first *top* rows are shown as stacked
            horizontal bars in figure 1.
        :param title_tag: word inserted in the title. If it is ``'drug'``
            and the drug decoder is not empty, the drug names are appended
            to the labels.
        :param top: maximum number of rows to show.

        The labels shown on the y-axis are stored in ``self.labels``.
        """
        top = min(top, len(df_count))

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)
        # add drug name
        if len(self.drug_decode) > 0 and title_tag == 'drug':
            for pos, drug_id in enumerate(labels):
                drug_name = self.drug_decode.get_name(drug_id)
                if drug_name is not None:
                    labels[pos] = "{}-{}".format(drug_id, drug_name)

        # underscores replaced by spaces, then restrict size to first 30
        # characters
        labels = [str(x).replace('_', ' ')[0:30] for x in labels]

        # bar positions, from top to bottom
        ind = list(reversed(range(0, len(labels))))

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        # resistant counts are stacked on the right of the sensitive ones
        pylab.barh(ind, data2, height=0.8, color='orange',
                   left=data1, label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # counts are integers: hide the labels of non-integer ticks
        ax.set_xticklabels(
                [int(x) if x % 1 == 0 else "" for x in ax.get_xticks()])

        pylab.grid()
        heading = r"Top %s %s(s) most frequently " % (top, title_tag)
        heading += "\nassociated with drug  response"
        pylab.title(heading, fontsize=self.settings.fontsize/1.2)
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$>$", self.settings.FDR_threshold, "$\%$"),
                     fontsize=18)

        # leave some room on the right of the longest bar
        longest = max(data1+data2)
        ax.set_xlim([0, longest+1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except:
            pass

    def get_significant_hits(self,  show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits
        :return: dataframe indexed by FDR threshold (5, 10, ..., 50) with
            four columns of counts:

            - ``1) significant``: rows with an FDR below the threshold;
            - ``2) 1 + meaningful``: those that also have a positive or
              negative mean log IC50 below 2;
            - ``3) 2 + strong``: those that also have a positive *or*
              negative Glass delta of at least 1;
            - ``4) 2+ very strong``: same with the positive *and* negative
              Glass deltas of at least 1.

        .. todo:: to finalise
        """
        fdrs = range(5, 50+1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        # "meaningful": at least one of the mean log IC50 is below one of
        # the two cutoffs. Boolean series are combined with | and & rather
        # than through .ix, which was removed in pandas 1.0.
        MC1 = 1
        MC2 = 2
        pos_mean = self.df['FEATURE_pos_logIC50_MEAN']
        neg_mean = self.df['FEATURE_neg_logIC50_MEAN']
        maskMC = ((pos_mean < MC1) | (pos_mean < MC2) |
                  (neg_mean < MC1) | (neg_mean < MC2))

        # these do not depend on the FDR threshold
        fdr_values = self.df['ANOVA_FEATURE_FDR']
        strong_pos = self.df['FEATURE_pos_Glass_delta'] >= 1
        strong_neg = self.df['FEATURE_neg_Glass_delta'] >= 1

        for fdr in fdrs:
            # significant hits
            significant = fdr_values < fdr
            significants.append(significant.sum())

            # meaningful hits
            meaningful = significant & maskMC
            significant_meaningful.append(meaningful.sum())

            # meaningful strong hits
            strong_hits.append(
                (meaningful & (strong_pos | strong_neg)).sum())

            # meaningful full strong hits
            full_strong_hits.append(
                (meaningful & strong_pos & strong_neg).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
                'strong_hits': strong_hits,
                'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data, columns = ['significants',
            'significant_meaningful', 'strong_hits', 'full_strong_hits'],
            index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
        '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        return df

    def __str__(self):
        """Print the summary given by ``df.info()``; return an empty string"""
        text = ""
        # info() is called for its output; its return value is not used
        self.df.info()
        return text

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`. The letter ``a`` is
        added in front of the identifier if it does not start with it.

        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        hits = self.get_significant_set()

        drugs = hits['DRUG_ID'].values
        features = hits['FEATURE'].values
        assocs = hits['ASSOC_ID'].values
        fdrs = hits['ANOVA_FEATURE_FDR'].values

        pb = Progress(len(hits))

        # one shared instance, updated for each association
        html = Association(self, drug='dummy', feature='dummy', fdr='dummy',
                company=self.company)

        records = zip(drugs, features, assocs, fdrs)
        for done, (drug, feature, assoc, fdr) in enumerate(records, start=1):
            html.drug = drug
            html.feature = feature
            basename = str(assoc)
            if not basename.startswith("a"):
                basename = "a" + basename
            html._filename = basename + '.html'
            html.fdr = fdr
            html.assoc_id = assoc
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(done)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature

        Only the features that have at least one significant association
        get a page.
        """
        hits = self.get_significant_set()
        groups = hits.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        features = groups.indices.keys()
        pb = Progress(len(features))
        for done, feature in enumerate(features, start=1):
            # significant associations of this feature only
            subdf = groups.get_group(feature)
            page = HTMLOneFeature(self, self.df, subdf, feature)
            page.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(done)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        All the drugs found in :attr:`df` get a page. Drugs without any
        significant association are given an empty dictionary instead of a
        dataframe of significant hits.
        """
        all_drugs = list(self.df['DRUG_ID'].unique())

        # significant hits grouped by drugs
        groups = self.get_significant_set().groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        pb = Progress(len(all_drugs))
        for done, drug in enumerate(all_drugs, start=1):
            subdf = groups.get_group(drug) if drug in groups.groups else {}

            page = HTMLOneDrug(self, self.df, subdf, drug)
            page.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(done)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)

        :param onweb: forwarded to the ``create_report`` method of the
            main page.

        ``settings.savefig`` is forced to True while the page is built and
        set back to its previous value afterwards, even if the creation of
        the report fails.
        """
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                (self.settings.directory))
        # instantiated for its side effects only
        # NOTE(review): presumably sets up the output directory -- confirm
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report() # created the directory
            html.create_report(onweb=onweb)
        finally:
            # do not leave the user's setting altered after a failure
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        hits = self.get_significant_set()
        HTMLPageMANOVA(self, hits, self.company).create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages

        :param onweb: forwarded to :meth:`create_html_main` only; the
            MANOVA page is always created with ``onweb=False``.
        """
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        # then the individual pages: drugs, features and associations
        for build in (self.create_html_drugs,
                      self.create_html_features,
                      self.create_html_associations):
            build()

    def onweb(self):
        """Open ``index.html`` from the report directory with easydev.onweb"""
        from easydev import onweb as open_page
        index_page = self.settings.directory + os.sep + 'index.html'
        open_page(index_page)