Example #1
0
    def compute_average(self, model_number=None):
        """Annotate every edge of the cnograph with the value of a model.

        Each edge receives three attributes: ``label`` and ``average`` (the
        model value passed through ``precision``) and ``penwidth`` (the value
        scaled so that the largest model value gives a width of 5).

        :param model_number: selects the model. ``None`` (default) uses the
            model returned by :meth:`get_average_model`, the string ``'cv'``
            the one returned by :meth:`get_cv_model`; any other value is
            looked up as a label of :attr:`df.index`.
        """
        if model_number is None:
            model = self.get_average_model()
        elif model_number == 'cv':
            model = self.get_cv_model()
        else:
            # Label-based lookup (DataFrame.ix was removed in pandas 1.0)
            model = self.df.loc[model_number]

        # The scaling factor is the same for all edges: compute it once
        M = float(model.max())

        # This is to set the average and label and penwidth
        # TODO: could be simplified using Reaction ?
        for source, target, attrs in self.cnograph.edges(data=True):
            link = attrs['link']
            if and_symbol in source:
                value = model[source]
            elif and_symbol in target:
                value = model[target]
            elif link == "-":
                # inhibitory link between two plain nodes
                value = model["!" + source + "=" + target]
            else:
                value = model[source + "=" + target]

            # graph[u][v] is the edge attribute dictionary in networkx 1.x
            # and 2.x alike (graph.edge[u][v] exists only in 1.x)
            edge_attrs = self.cnograph[source][target]
            edge_attrs["label"] = precision(value)
            edge_attrs["average"] = precision(value)

            # if values are between 0 and 1; a null maximum would otherwise
            # trigger a division by zero
            if M:
                edge_attrs["penwidth"] = precision(value, 2) * 5/M
            else:
                edge_attrs["penwidth"] = 0.0
    def __init__(self, drugid, caller):
        """Prepare the HTML page dedicated to a single drug.

        The parameters of the page are read from the JSON file named
        ``<caller.prefix>results_<drugid>.json``.

        :param drugid: identifier of the drug; it is cast to an integer.
        :param caller: object providing the ``output_dir``, ``prefix`` and
            ``method`` attributes.
        """
        self.drug = int(drugid)
        self.caller = caller

        filename = "drug_{0}.html".format(self.drug)
        super(HTMLOneDrug, self).__init__(
                directory=caller.output_dir,
                filename=filename,
                template_filename='regression.html',
                init_report=False)
        self.title = 'Single Drug analysis (%s)' % self.drug
        self.params = {"drugid": self.drug}

        self.filename_template = self.caller.prefix + "%(name)s_" + "%s." % self.drug
        results_filename = self.filename_template % {"name":"results"} + "json"

        with open(results_filename, "r") as fh:
            data = json.load(fh)

        # Best effort: shorten the values that are present and numeric. A
        # missing key or a value that cannot be converted (None, NaN, inf)
        # is left untouched instead of aborting the report.
        for key, digits in (("bayes", 3), ("alpha", 3)):
            try:
                data[key] = easydev.precision(data[key], digits)
            except (KeyError, TypeError, ValueError, OverflowError):
                pass

        self.params.update(data)
        self.params['method'] = self.caller.method
        self.jinja['sections'] = []
Example #3
0
    def __init__(self, drugid, caller):
        """Prepare the HTML page dedicated to a single drug.

        The parameters of the page are read from the JSON file named
        ``<caller.prefix_data>results_<drugid>.json``.

        :param drugid: identifier of the drug; it is cast to an integer.
        :param caller: object providing the ``output_dir``, ``prefix_data``
            and ``method`` attributes.
        """
        self.drug = int(drugid)
        self.caller = caller

        filename = "drug_{0}.html".format(self.drug)
        super(HTMLOneDrug, self).__init__(directory=caller.output_dir,
                                          filename=filename,
                                          template_filename='regression.html',
                                          init_report=False)
        self.title = 'Single Drug analysis (%s)' % self.drug
        self.params = {"drugid": self.drug}

        filename_template = self.caller.prefix_data + "%(name)s_" + "%s." % self.drug
        results_filename = filename_template % {"name": "results"} + "json"

        with open(results_filename, "r") as fh:
            data = json.load(fh)

        # Best effort: shorten the values that are present and numeric. A
        # missing key or a value that cannot be converted (None, NaN, inf)
        # is left untouched instead of aborting the report.
        for key, digits in (("bayes", 3), ("alpha", 5), ("Rp", 4)):
            try:
                data[key] = easydev.precision(data[key], digits)
            except (KeyError, TypeError, ValueError, OverflowError):
                pass

        self.params.update(data)
        self.params['method'] = self.caller.method
        self.jinja['sections'] = []
        self.jinja['goback'] = True
Example #4
0
    def plot(self, filename, vmin=None, vmax=None, cmap='jet_r'):
        """Draw ``-log10`` of :attr:`results` as an image and save it.

        Rows of :attr:`results` located before ``self._start_y`` are not
        drawn. Tick labels are derived from ``self.duration`` (x axis) and
        ``self.sampling`` (y axis).

        :param filename: output file, passed to ``pylab.savefig``.
        :param vmin: lower bound of the colour scale (passed to ``imshow``).
        :param vmax: upper bound of the colour scale (passed to ``imshow``).
        :param cmap: name of the colormap (passed to ``imshow``).
        """
        pylab.clf()
        pylab.imshow(-np.log10(self.results[self._start_y:, :]),
                     origin="lower", aspect="auto",
                     cmap=cmap, vmin=vmin, vmax=vmax)
        pylab.colorbar()

        # Fix xticks
        XMAX = float(self.results.shape[1])  # The max integer on xaxis
        # The step must be at least 1: with fewer than 5 columns int(XMAX/5)
        # is 0 and range() refuses a null step
        xpos = list(range(0, int(XMAX), max(1, int(XMAX / 5))))
        xx = [precision(this) for this in np.array(xpos) / XMAX * self.duration]
        pylab.xticks(xpos, xx, fontsize=16)

        # Fix yticks
        # NOTE(review): the image starts at row self._start_y whereas the
        # ticks are computed from the full height -- confirm this is intended
        YMAX = float(self.results.shape[0])  # The max integer on yaxis
        ypos = list(range(0, int(YMAX), max(1, int(YMAX / 5))))
        yy = [int(this) for this in np.array(ypos) / YMAX * self.sampling]
        pylab.yticks(ypos, yy, fontsize=16)

        pylab.xlabel("Time (seconds)", fontsize=25)
        pylab.ylabel("Frequence (Hz)", fontsize=25)
        pylab.tight_layout()
        pylab.savefig(filename)
Example #5
0
    def _get_html_stats(self):
        """Return an HTML snippet summarising ``bwa_mem_stats.json``.

        The snippet reports the ``contamination``, ``unpaired`` and
        ``duplicated`` entries of the statistics. A datatable holding the
        mapped/unmapped counts is built as well, but it is not inserted in
        the returned HTML.
        """
        from sequana.tools import StatsBAM2Mapped
        from easydev import precision

        stats = StatsBAM2Mapped(self.directory + "bwa_mem_stats.json").data
        lines = ["Reads with Phix: %s %%<br>" % precision(stats['contamination'], 3)]

        # Mapped/unmapped counts; the R2 column exists for paired data only
        paired = "R2_mapped" in stats.keys()
        counts = {'R1': [stats['R1_mapped'], stats['R1_unmapped']]}
        if paired:
            counts['R2'] = [stats['R2_mapped'], stats['R2_unmapped']]
        df = pd.DataFrame(counts)
        df.index = ['mapped', 'unmapped']

        options = {
            'scrollX': '300px',
            'pageLength': 15,
            'scrollCollapse': 'true',
            'dom': 'irtpB',
            "paging": "false",
            'buttons': ['copy', 'csv'],
        }
        datatable = DataTable(df, "bwa_bam")
        datatable.datatable.datatable_options = options
        # The javascript and the HTML table are generated but currently not
        # appended to the returned snippet
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')

        lines.append("Unpaired: %s <br>" % stats['unpaired'])
        lines.append("duplicated: %s <br>" % stats['duplicated'])
        return "".join(lines)
Example #6
0
 def _get_html_summary_section(self):
     """Return one HTML line giving the ``contamination`` entry of the summary.

     The summary is obtained from :meth:`_get_summary` and the value is
     passed through ``easydev.precision`` with 3 digits.
     """
     from easydev import precision

     summary = self._get_summary()
     contamination = precision(summary['contamination'], 3)
     return "Percentage of reads found with Phix: %s %%<br>" % contamination
Example #7
0
 def prec(x):
     """Return *x* processed by ``easydev.precision``, or *x* itself on failure.

     The number of digits is read from ``self.pd_options['precision']``.
     """
     try:
         # this may fail if for instance x is nan or inf
         return easydev.precision(x, self.pd_options['precision'])
     except Exception:
         # Best effort: values that cannot be processed are returned
         # unchanged. KeyboardInterrupt/SystemExit are no longer swallowed.
         return x
Example #8
0
 def _get_html_summary_section(self):
     """Build the HTML line that reports the ``contamination`` value.

     The value is read from the dictionary returned by :meth:`_get_summary`
     and passed through ``easydev.precision`` with 3 digits.
     """
     from easydev import precision

     template = "Percentage of reads found with Phix: %s %%<br>"
     summary = self._get_summary()
     return template % precision(summary['contamination'], 3)
Example #9
0
    def multi_mapping(self,
                      fr="ID",
                      to="KEGG_ID",
                      query="P13368",
                      frmt="tab",
                      Nmax=100):
        """Calls mapping several times and concatenates results

        Duplicated entries of *query* are removed. If more than *Nmax*
        unique entries remain, :meth:`mapping` is called once per chunk of
        at most *Nmax* entries and the returned dictionaries are merged.

        :param fr: passed to :meth:`mapping`.
        :param to: passed to :meth:`mapping`.
        :param query: a single entry or a list of entries.
        :param frmt: accepted for backward compatibility; it is not
            forwarded to :meth:`mapping`.
        :param int Nmax: maximum number of entries per call.
        :return: what :meth:`mapping` returns (single call) or a dictionary
            merging the results of all the calls.

        .. deprecated: 1.3.1 you can now use :meth:`mapping` even for long queries since
            we are now using a POST request, which allows arbitrary length of entries.
        """
        self.logging.warning(
            "deprecated in version 1.3.1. Use mapping instead")
        if isinstance(query, list) is False:
            query = [query]

        unique_entry_names = list(set(query))
        total = len(unique_entry_names)

        if total <= Nmax:
            return self.mapping(fr=fr, to=to,
                                query=",".join(unique_entry_names))

        self.logging.info(
            "There are more than %s unique species. Using multi stage uniprot mapping"
            % Nmax)
        mapping = {}
        # we need to split
        # this is a hack right now but could be put inside bioservices
        starts = range(0, total, Nmax)
        for done, start in enumerate(starts, 1):
            chunk = unique_entry_names[start:start + Nmax]
            mapping.update(
                self.mapping(fr=fr, to=to, query=",".join(chunk)))
            # the message is %-formatted so that "%%" is rendered as "%"
            self.logging.info(
                "%.2f%% completed" % (100. * done / len(starts)))
        return mapping
Example #10
0
    def to_html(self):
        """Return an HTML snippet built from :attr:`data`.

        The snippet reports the ``contamination``, ``unpaired`` and
        ``duplicated`` entries. A dataframe with the mapped/unmapped counts
        is also created, but it is not part of the returned HTML.
        """
        stats = self.data

        lines = ["Reads with Phix: %s %%<br>" % precision(stats['contamination'], 3)]

        # Table of counts; the R2 column exists for paired data only
        paired = "R2_mapped" in stats.keys()
        counts = {'R1': [stats['R1_mapped'], stats['R1_unmapped']]}
        if paired:
            counts['R2'] = [stats['R2_mapped'], stats['R2_unmapped']]
        df = pd.DataFrame(counts)
        df.index = ['mapped', 'unmapped']

        lines.append("Unpaired: %s <br>" % stats['unpaired'])
        lines.append("duplicated: %s <br>" % stats['duplicated'])
        return "".join(lines)
Example #11
0
    def _get_df_with_taxon(self, dbname):

        # line 14500
        # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome

        df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
        df['count'] = self.taxons.values
        df.reset_index(inplace=True)
        newrow = len(df)
        df.ix[newrow] = "Unclassified"
        df.ix[newrow, 'count'] = self.unclassified
        df.ix[newrow, 'index'] = -1
        df.rename(columns={"index": "taxon"}, inplace=True)
        df["percentage"] = df["count"] / df["count"].sum() * 100

        # Now get back all annotations from the database itself.
        filename = dbname + os.sep + "annotations.csv"
        if os.path.exists(filename):
            annotations = pd.read_csv(filename)
            annotations.set_index("taxon", inplace=True)

            df2 = annotations.ix[df.taxon][['ena', 'gi', 'description']]
            # There are duplicates sohow. let us keep the first one for now
            df2 = df2.reset_index().drop_duplicates(
                subset="taxon", keep="first").set_index("taxon")
            self.df2 = df2
            self.df1 = df.set_index("taxon")
            df = pd.merge(self.df1, df2, left_index=True, right_index=True)
            df.reset_index(inplace=True)
            starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
            df = df[starter + [
                x
                for x in df.columns if x not in starter and x != "description"
            ] + ["description"]]

            df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
            from easydev import precision
            df['percentage'] = [str(precision(x, 2)) for x in df['percentage']]
        else:
            starter = ['taxon', 'count', 'percentage']
            df = df[starter + [x for x in df.columns if x not in starter]]

        df.sort_values(by="percentage", inplace=True, ascending=False)
        return df
Example #12
0
    def to_html(self):
        """Build the HTML snippet describing :attr:`data`.

        The ``contamination``, ``unpaired`` and ``duplicated`` entries are
        reported. The dataframe of mapped/unmapped counts is created but it
        is not included in the returned HTML.
        """
        stats = self.data

        # TODO hardcoded word phix here ?
        contamination = precision(stats['contamination'], 3)
        pieces = ["Reads with Phix: %s %%<br>" % contamination]

        # Table of counts; the R2 column exists for paired data only
        with_r2 = "R2_mapped" in stats.keys()
        columns = {'R1': [stats['R1_mapped'], stats['R1_unmapped']]}
        if with_r2:
            columns['R2'] = [stats['R2_mapped'], stats['R2_unmapped']]
        df = pd.DataFrame(columns)
        df.index = ['mapped', 'unmapped']

        for template, key in (("Unpaired: %s <br>", 'unpaired'),
                              ("duplicated: %s <br>", 'duplicated')):
            pieces.append(template % stats[key])
        return "".join(pieces)
Example #13
0
    def _get_df_with_taxon(self, dbname):

        # line 14500
        # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome

        df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
        df['count'] = self.taxons.values
        df.reset_index(inplace=True)
        newrow = len(df)
        df.ix[newrow] = "Unclassified"
        df.ix[newrow, 'count'] = self.unclassified
        df.ix[newrow, 'index'] = -1
        df.rename(columns={"index":"taxon"}, inplace=True)
        df["percentage"] = df["count"] / df["count"].sum() * 100

        # Now get back all annotations from the database itself.
        filename = dbname + os.sep + "annotations.csv"
        if os.path.exists(filename):
            annotations = pd.read_csv(filename)
            annotations.set_index("taxon", inplace=True)

            df2 = annotations.ix[df.taxon][['ena', 'gi', 'description']]
            # There are duplicates sohow. let us keep the first one for now
            df2 = df2.reset_index().drop_duplicates(subset="taxon",
                keep="first").set_index("taxon")
            self.df2 = df2
            self.df1 = df.set_index("taxon")
            df = pd.merge(self.df1, df2, left_index=True, right_index=True)
            df.reset_index(inplace=True)
            starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
            df = df[starter + [x for x in df.columns if x not in starter and
                                x!="description"] +  ["description"]]

            df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
            from easydev import precision
            df['percentage'] = [str(precision(x,2)) for x in df['percentage']]
        else:
            starter = ['taxon', 'count', 'percentage']
            df = df[starter + [x for x in df.columns if x not in starter]]

        df.sort_values(by="percentage", inplace=True, ascending=False)
        return df
Example #14
0
    def multi_mapping(self, fr="ID", to="KEGG_ID", query="P13368",
            frmt="tab", Nmax=100):
        """Calls mapping several times and concatenates results

        Duplicated entries of *query* are removed. If more than *Nmax*
        unique entries remain, :meth:`mapping` is called once per chunk of
        at most *Nmax* entries and the returned dictionaries are merged.

        :param fr: passed to :meth:`mapping`.
        :param to: passed to :meth:`mapping`.
        :param query: a single entry or a list of entries.
        :param frmt: accepted for backward compatibility; it is not
            forwarded to :meth:`mapping`.
        :param int Nmax: maximum number of entries per call.
        :return: what :meth:`mapping` returns (single call) or a dictionary
            merging the results of all the calls.

        .. deprecated: 1.3.1 you can now use :meth:`mapping` even for long queries since
            we are now using a POST request, which allows arbitrary length of entries.
        """
        self.logging.warning("deprecated in version 1.3.1. Use mapping instead")
        if isinstance(query, list) is False:
            query = [query]

        unique_entry_names = list(set(query))
        n_entries = len(unique_entry_names)

        if n_entries > Nmax:
            self.logging.info("There are more than %s unique species. Using multi stage uniprot mapping" % Nmax)
            mapping = {}
            # we need to split
            # this is a hack right now but could be put inside bioservices
            offsets = range(0, n_entries, Nmax)
            n_chunks = len(offsets)
            for count, offset in enumerate(offsets, 1):
                query = ",".join(unique_entry_names[offset:offset + Nmax])
                mapping.update(self.mapping(fr=fr, to=to, query=query))
                # the message is %-formatted so that "%%" is rendered as "%"
                self.logging.info("%.2f%% completed" % (100. * count / n_chunks))
        else:
            query = ",".join(unique_entry_names)
            mapping = self.mapping(fr=fr, to=to, query=query)
        return mapping
Example #15
0
    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods
        such as volcano_plot_all"""
        # This functio is a bit complicated because it does create a few tricky
        # plots

        # It creates a volcano plot, which is the easy part
        # Then, it creates tooltips for the user interface in an IPython
        # shell using a callback to 'onpick' function coded here below
        # !! There seem to bes a memory leak in this function due to matplotlib
        # This is not easy to track down and should have no impact now that
        # ANOVAReport using JS instead of matplotlib 

        data = data.replace(np.inf, 0)
        data = data.replace(-np.inf, 0)

        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values

        Y = -np.log10(list(pvalues)) # should be cast to list ?

        num = 1
        #pylab.close(num)
        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()

        # TODO signed effects may be inf why ?


        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X, Y, s=markersize,
                alpha=0.3, c=colors, linewidth=1, picker=True)
        scatter.set_zorder(11)

        m = abs(signed_effects.min())
        M = abs(signed_effects.max())
        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        l = max([m, M]) * 1.1
        pylab.xlim([-l, l])
        ax.grid(color='white', linestyle='solid')

        # some aliases
        fdr = self.settings.FDR_threshold
        if fdr < self.df[self._colname_qvalue].min():
            fdr = self.df[self._colname_qvalue].min()

        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1] # reverse sorting

        styles = ['--', ':', '-.']
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
            color='red', alpha=1, label="FDR %s " %  fdr + " %")

        for i, this in enumerate(fdrs):
            if this < self.df[self._colname_qvalue].min() or\
                this > self.df[self._colname_qvalue].max():
                    continue
            pvalue = get_pvalue_from_fdr(this)
            ax.axhline(-np.log10(pvalue), linestyle=styles[i],
                color='red', alpha=1, label="FDR %s " % this +" %")

        pylab.ylim([0, pylab.ylim()[1]*1.2]) # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)
        axl = pylab.legend(loc='best')
        axl.set_zorder(10) # in case there is a circle behind the legend.

        #self.ax = ax
        #self.axx = ax.twinx()
        #self.common_ticks = ax.get_yticks()
        #self.common_ylim = ax.get_ylim()
        #pvals = self.df[self._colname_pvalue]
        #y1 = pvals.min()
        #y2 = pvals.max()
        #fdr1 = self._get_fdr_from_pvalue_interp(y1)
        #fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
        #self.axx.set_ylim([fdr2, fdr1])
        #self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)

        # For the static version
        title_handler = pylab.title("%s" % str(title).replace("_","  "),
                fontsize=self.settings.fontsize/1.2)
        labels = []

        # This code allows the ipython user to click on the matplotlib figure
        # to get information about the drug and feature of a given circles.
        def onpick(event):
            ind = event.ind[0]
            try:
                title = str(str(data.ix[ind]['Drug'])) + " / " + str(data.ix[ind].Feature)
                title += "\nFDR=" + "%.4e" % data.ix[ind]['FDR']
                title_handler.set_text(title.replace("_","  "))
            except:
                print('Failed to create new title on click')
            print(data.ix[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        # for the JS version
        # TODO: for the first 1 to 2000 entries ?
        labels = []
        self.data = data
        for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():

            template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>""" % row.to_dict()
            labels.append(template)

            # this is more elegant but slower
            #label = row.to_frame()
            #label.columns = ['Row {0}'.format(i)]
            #labels.append(str(label.to_html(header=False)))
        self.scatter = scatter
        self.current_fig = fig
        # not sure is this is required. could be a memory leak here
        import gc
        gc.collect()
Example #16
0
    def _add_patches(self, df, method, fill, ax, diagonal=True):
        width, height = df.shape
        labels = (df.columns)

        patches = []
        colors = []
        for x in range(width):
            for y in range(height):
                if fill == 'lower' and x > y:
                    continue
                elif fill == 'upper' and x < y:
                    continue
                if diagonal is False and x==y:
                    continue
                datum = (df.ix[x, y] +1.)/2.
                d = df.ix[x, y]
                d_abs = np.abs(d)
                #c = self.pvalues[x, y]
                rotate = -45 if d > 0 else +45
                #cmap = self.poscm if d >= 0 else self.negcm
                if method in ['ellipse', 'square', 'rectangle', 'color']:
                    if method == 'ellipse':
                        func = Ellipse
                        patch = func((x, y), width=1 * self.shrink,
                                  height=(self.shrink - d_abs*self.shrink), angle=rotate)
                    else:
                        func = Rectangle
                        w = h = d_abs * self.shrink
                        #FIXME shring must be <=1
                        offset = (1-w)/2.
                        if method == 'color':
                            w = 1
                            h = 1
                            offset = 0
                        patch = func((x + offset-.5, y + offset-.5), width=w,
                                  height=h, angle=0)
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    #patch.set_facecolor(cmap(d_abs))
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    #ax.add_artist(patch)
                    patches.append(patch)
                    #FIXME edgecolor is always printed
                elif method=='circle':
                    patch = Circle((x, y), radius=d_abs*self.shrink/2.)
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    #patch.set_facecolor(cmap(d_abs))
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    #ax.add_artist(patch)
                    patches.append(patch)
                elif method in ['number', 'text']:
                    from easydev import precision
                    #FIXME
                    if d<0:
                        edgecolor = 'red'
                    elif d>0:
                        edgecolor = 'blue'
                    ax.text(x,y, precision(d, 2), color=edgecolor,
                            fontsize=self.fontsize, horizontalalignment='center',
                            weight='bold', alpha=d_abs,
                            withdash=False)
                elif method == 'pie':
                    S = 360 * d_abs
                    patch = [
                        Wedge((x,y), 1*self.shrink/2., -90, S-90),
                        Wedge((x,y), 1*self.shrink/2., S-90, 360-90),
                        ]
                    #patch[0].set_facecolor(cmap(d_abs))
                    #patch[1].set_facecolor('white')
                    colors.append(datum)
                    colors.append(0.5)
                    if self.edgecolor:
                        patch[0].set_edgecolor(self.edgecolor)
                        patch[1].set_edgecolor(self.edgecolor)

                    #ax.add_artist(patch[0])
                    #ax.add_artist(patch[1])
                    patches.append(patch[0])                    
                    patches.append(patch[1])
        if len(patches):                    
            col1 = PatchCollection(patches, array=np.array(colors), cmap=self.cm)
            ax.add_collection(col1)

            self.collection = col1
Example #17
0
    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The dataframe has two columns, ``text`` and ``value``, and is filled
        row by row with :meth:`_df_append`. Rows holding two empty strings
        are used as separators.
        """
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)

        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / (N) * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except (TypeError, ValueError, OverflowError):
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        df = self._df_append(df, ["Type of analysis",
                                  self.settings.analysis_type])

        df = self._df_append(
            df, ["Total number of possible drug/feature associations", int(N)])
        df = self._df_append(
            df, ["Total number of ANOVA tests performed", self.n_tests])
        df = self._df_append(df, ["Percentage of tests performed", ratio])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])

        df = self._df_append(df, ["Total number of tested drugs", n_drugs])
        df = self._df_append(
            df, ["Total number of genomic features used", n_features])

        df = self._df_append(
            df, ["Total number of screened cell lines", self.n_celllines])

        msi = self.settings.include_MSI_factor
        df = self._df_append(
            df, ["MicroSatellite instability included as factor", msi])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        df = self._df_append(
            df, ["Total number of significant associations", nsens + nres])
        df = self._df_append(df, [" - sensitive", nsens])
        df = self._df_append(df, [" - resistant", nres])

        df = self._df_append(df, ["p-value significance threshold",
                                  self.settings.pvalue_threshold])

        df = self._df_append(df, ["FDR significance threshold",
                                  self.settings.FDR_threshold])

        p1, p2 = self._get_pval_range()
        value = "[{:.4}, {:.4}]".format(p1, p2)
        df = self._df_append(df, ['Range of significant p-values', value])

        f1, f2 = self._get_fdr_range()
        value = '[{:.4} {:.4}]'.format(f1, f2)
        df = self._df_append(df, ["Range of significant % FDRs", value])
        return df
Example #18
0
    def _add_patches(self, df, method, fill, ax, diagonal=True):
        width, height = df.shape
        labels = (df.columns)

        patches = []
        colors = []
        for x in xrange(width):
            for y in xrange(height):
                if fill == 'lower' and x > y:
                    continue
                elif fill == 'upper' and x < y:
                    continue
                if diagonal is False and x==y:
                    continue
                datum = (df.ix[x, y] +1.)/2.
                d = df.ix[x, y]
                d_abs = np.abs(d)
                #c = self.pvalues[x, y]
                rotate = -45 if d > 0 else +45
                #cmap = self.poscm if d >= 0 else self.negcm
                if method in ['ellipse', 'square', 'rectangle', 'color']:
                    if method == 'ellipse':
                        func = Ellipse
                        patch = func((x, y), width=1 * self.shrink,
                                  height=(self.shrink - d_abs*self.shrink), angle=rotate)
                    else:
                        func = Rectangle
                        w = h = d_abs * self.shrink
                        #FIXME shring must be <=1
                        offset = (1-w)/2.
                        if method == 'color':
                            w = 1
                            h = 1
                            offset = 0
                        patch = func((x + offset-.5, y + offset-.5), width=w,
                                  height=h, angle=0)
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    #patch.set_facecolor(cmap(d_abs))
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    #ax.add_artist(patch)
                    patches.append(patch)
                    #FIXME edgecolor is always printed
                elif method=='circle':
                    patch = Circle((x, y), radius=d_abs*self.shrink/2.)
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    #patch.set_facecolor(cmap(d_abs))
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    #ax.add_artist(patch)
                    patches.append(patch)
                elif method in ['number', 'text']:
                    from easydev import precision
                    if d<0:
                        edgecolor = 'red'
                    elif d>=0:
                        edgecolor = 'blue'
                    ax.text(x,y, precision(d, 2), color=edgecolor,
                            fontsize=self.fontsize, horizontalalignment='center',
                            weight='bold', alpha=d_abs,
                            withdash=False)
                elif method == 'pie':
                    S = 360 * d_abs
                    patch = [
                        Wedge((x,y), 1*self.shrink/2., -90, S-90),
                        Wedge((x,y), 1*self.shrink/2., S-90, 360-90),
                        ]
                    #patch[0].set_facecolor(cmap(d_abs))
                    #patch[1].set_facecolor('white')
                    colors.append(datum)
                    colors.append(0.5)
                    if self.edgecolor:
                        patch[0].set_edgecolor(self.edgecolor)
                        patch[1].set_edgecolor(self.edgecolor)

                    #ax.add_artist(patch[0])
                    #ax.add_artist(patch[1])
                    patches.append(patch[0])                    
                    patches.append(patch[1])
        if len(patches):                    
            col1 = PatchCollection(patches, array=np.array(colors), cmap=self.cm)
            ax.add_collection(col1)

            self.collection = col1
Example #19
0
    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods
        such as volcano_plot_all"""
        # This functio is a bit complicated because it does create a few tricky
        # plots

        # It creates a volcano plot, which is the easy part
        # Then, it creates tooltips for the user interface in an IPython
        # shell using a callback to 'onpick' function coded here below
        # finally, it creates a Javascript connection using mpld3 that
        # will allow the creation of a JS version of the plot.

        # !! There is a memory leak in this function due to matplotlib
        # This is not easy to track down.

        # You have to call clf() to make sure the content is erase.
        # One reason for the memory leak is that it is called in the
        # Report to loop over all drugs and then all featuers.
        # To see the memory leak, you will need to call the
        # volcano_plot_all_drugs function (or volcano_plot_all_features).
        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values

        Y = -np.log10(list(pvalues))  # should be cast to list ?

        num = 1
        #pylab.close(num)
        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()
        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X,
                             Y,
                             s=markersize,
                             alpha=0.3,
                             c=colors,
                             linewidth=1,
                             picker=True)
        scatter.set_zorder(11)

        m = abs(signed_effects.min())
        M = abs(signed_effects.max())
        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        l = max([m, M]) * 1.1
        pylab.xlim([-l, l])
        ax.grid(color='white', linestyle='solid')

        # some aliases
        fdr = self.settings.FDR_threshold
        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1]  # reverse sorting

        styles = ['--', ':', '-.']
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue),
                   linestyle='--',
                   lw=2,
                   color='red',
                   alpha=1,
                   label="FDR %s " % fdr + " \%")

        for i, this in enumerate(fdrs):
            if this < self.df[self._colname_qvalue].min() or\
                this > self.df[self._colname_qvalue].max():
                continue
            pvalue = get_pvalue_from_fdr(this)
            ax.axhline(-np.log10(pvalue),
                       linestyle=styles[i],
                       color='red',
                       alpha=1,
                       label="FDR %s " % this + " \%")

        pylab.ylim([0, pylab.ylim()[1] * 1.2])  # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)
        axl = pylab.legend(loc='best')
        axl.set_zorder(10)  # in case there is a circle behind the legend.

        #self.ax = ax
        #self.axx = ax.twinx()
        #self.common_ticks = ax.get_yticks()
        #self.common_ylim = ax.get_ylim()
        #pvals = self.df[self._colname_pvalue]
        #y1 = pvals.min()
        #y2 = pvals.max()
        #fdr1 = self._get_fdr_from_pvalue_interp(y1)
        #fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
        #self.axx.set_ylim([fdr2, fdr1])
        #self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)

        # For the static version
        title_handler = pylab.title("%s" % title.replace("_", "  "),
                                    fontsize=self.settings.fontsize / 1.2)
        labels = []

        # This code allows the ipython user to click on the matplotlib figure
        # to get information about the drug and feature of a given circles.
        def onpick(event):
            ind = event.ind[0]
            try:
                title = str(data.ix[ind]['Drug']) + " / " + str(
                    data.ix[ind].Feature)
                title += "\nFDR=" + "%.4e" % data.ix[ind]['FDR']
                title_handler.set_text(title.replace("_", "  "))
            except:
                print('Failed to create new title on click')
            print(data.ix[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        # for the JS version
        # TODO: for the first 1 to 2000 entries ?
        labels = []

        self.data = data
        for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():

            template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>""" % row.to_dict()
            labels.append(template)

            # this is more elegant but slower
            #label = row.to_frame()
            #label.columns = ['Row {0}'.format(i)]
            #labels.append(str(label.to_html(header=False)))
        css = """
        svg.mpld3-figure { border: 2px black solid;margin:10px;}
        table{  font-size:0.8em;  }
        th {  color: #ffffff;  background-color: #aaaaaa;  }
        td { color: blue; background-color: #cccccc; }"""

        try:
            import mpld3
            tooltip = mpld3.plugins.PointHTMLTooltip(scatter,
                                                     labels=labels,
                                                     css=css)
            mpld3.plugins.connect(fig, tooltip)
        except:
            print("Issue with javascript version of the volcano plot. Skipped")
        self.scatter = scatter
        self.current_fig = fig
        # not sure is this is required. could be a memory leak here
        import gc
        gc.collect()
Example #20
0
    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The summary is a dataframe with two columns, ``text`` and
        ``value``, holding one row per reported quantity: type of analysis,
        number of possible and performed tests, number of drugs, features
        and cell lines, MSI factor flag, number of significant
        associations, significance thresholds and the ranges of significant
        p-values and FDRs. Rows where both fields are empty strings are
        used as visual separators.

        :return: the summary dataframe, built row by row with
            :meth:`_df_append`.
        """
        self._set_sensible_df()

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        # Number of possible drug/feature pairs.
        N = float(n_drugs * n_features)

        # Percentage of the possible tests that were actually performed.
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except Exception:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        def rows():
            # Yield (text, value) pairs in display order. Being a
            # generator, each value is computed only when its row is about
            # to be appended, exactly as in a sequence of explicit calls.
            yield "Type of analysis", self.settings.analysis_type
            yield "Total number of possible drug/feature associations", int(N)
            yield "Total number of ANOVA tests performed", self.n_tests
            yield "Percentage of tests performed", ratio

            # trick to have an empty line
            yield "", ""

            yield "Total number of tested drugs", n_drugs
            yield "Total number of genomic features used", n_features
            yield "Total number of screened cell lines", self.n_celllines
            yield ("MicroSatellite instability included as factor",
                   self.settings.include_MSI_factor)

            # trick to have an empty line
            yield "", ""

            nsens = len(self.sensible_df)
            nres = len(self.resistant_df)
            yield "Total number of significant associations", nsens + nres
            yield " - sensitive", nsens
            yield " - resistant", nres

            yield "p-value significance threshold", self.settings.pvalue_threshold
            yield "FDR significance threshold", self.settings.FDR_threshold

            p1, p2 = self._get_pval_range()
            yield ('Range of significant p-values',
                   "[{:.4}, {:.4}]".format(p1, p2))

            # NOTE(review): unlike the p-value range above, there is no
            # comma between the two bounds here -- confirm this is intended
            # before changing the reported string.
            f1, f2 = self._get_fdr_range()
            yield ("Range of significant % FDRs",
                   '[{:.4} {:.4}]'.format(f1, f2))

        df = pd.DataFrame({'text': [], 'value': []})
        for text, value in rows():
            df = self._df_append(df, [text, value])
        return df