Beispiel #1
0
    def worker(task):
        '''Fit one incremental vMF mixture model and plot its diagnostics.

        ``task`` is an ``(i, r)`` pair: both values are embedded in the cache
        alias and ``r`` is additionally passed as ``randomState``.  The names
        ``tdf`` and ``figs`` are not defined here and must be supplied by the
        surrounding scope.

        Side effects: stores two figures in ``figs`` and writes the cluster
        assignment to ``cluster.csv``.

        Returns ``(alias, fig)`` where ``fig`` is the diagnostic figure.
        '''
        # Tuple parameters (``def worker((i, r))``) were removed by PEP 3113;
        # unpacking in the body accepts the same single positional argument.
        i, r = task
        nIter = 100
        alias = 'i-%d_r-%d' % (i, r)

        mdl0 = pyjob.job__cluster__mixtureVMF__incr(
            normalizeSample=0,  #### set to 1 to normalize the vector length
            tdf=tdf,
            meanNorm=1,  ##### perform X = X-E(X)_
            weighted=True,
            init_method='random',
            nIter=nIter,
            #### temperature range; previously tried: start=0.001, end=2.0
            start=0.2,
            end=0.7,
            #### alternatively, pass a callable for temperature via betas=...
            randomState=r,
            alias='mdl_' + alias,  #### filename of cache produced
            verbose=2,
            K=60,
        )

        ##### produce diagnostic plot
        YCUT = entropy_cutoff = 2.5
        XCUT = step = 30

        # The return value is not needed; the call draws onto the current
        # figure, which is picked up through plt.gcf() below.
        # (Passing XCUT=step, YCUT=entropy_cutoff is "not working yet".)
        pycbk.qc__vmf__speed(mdl0)
        fig = plt.gcf()
        ax = fig.axes[0]
        pyvis.abline(y0=YCUT, k=0, ax=ax)
        pyvis.abline(x0=XCUT, k=0, ax=ax)
        figs['diagnostic-plot'] = plt.gcf()

        #### using the last model to predict cluster
        mdls = mdl0.callback.mdls  #### models are recorded for each point
        mdl = mdls[step][-1]  #### getting the model at step
        clu = mdl.predictClu(tdf, entropy_cutoff=entropy_cutoff)
        clu.to_csv('cluster.csv')  ### getting cluster assignment

        pyvis.heatmap(tdf.reindex(clu.sort_values('clu').index),
                      figsize=[14, 7])
        figs['clustered-heatmap'] = plt.gcf()
        return (alias, fig)
Beispiel #2
0
    def heatmap(self,
                C=None,
                vlim=None,
                cname='test',
                reorder=0,
                ax=None,
                **kwargs):
        '''Draw this matrix (or ``C``) as a transposed heatmap.

        ``vlim`` and ``C`` fall back to ``self.vlim`` and ``self.values`` when
        None.  A truthy ``reorder`` calls ``self.reorder()`` first.  The x
        ticks are taken from ``self.colName_short()``; remaining ``kwargs``
        are forwarded to ``pyvis.heatmap``, whose result is returned.
        '''
        if vlim is None:
            vlim = self.vlim
        if reorder:
            self.reorder()
        if C is None:
            C = self.values
        tick_labels = self.colName_short()

        # [DEV] addBox() is disabled for now, so the image is returned as-is.
        return pyvis.heatmap(C,
                             xlab='Condition',
                             xtick=tick_labels,
                             transpose=1,
                             cname=cname,
                             vlim=vlim,
                             ax=ax,
                             **kwargs)
Beispiel #3
0
    def heatmap(
            self,
            C=None,
            vlim=None,
            cname='test',
            xlab='Condition',
            ylab='',
            ytick=None,
            xtick=None,
            reorder=0,
            ax=None,
            transpose=0,  ### [buggy] pymisca updated transpose
            tickMax=100,
            short=1,
            **kwargs):
        '''Draw this matrix (or ``C``) as a heatmap through ``pyvis.heatmap``.

        ``vlim`` and ``C`` fall back to ``self.vlim`` and ``self.values`` when
        None, and the matrix is always transposed before plotting.  Missing
        x ticks come from ``self.colName_short()`` (``short`` truthy) or
        ``self.columns``; missing y ticks come from ``self.index`` only when
        the object has fewer than 500 rows.  With ``transpose`` truthy the
        x/y ticks and labels are swapped.  Returns the ``pyvis.heatmap``
        result.
        '''
        if vlim is None:
            vlim = self.vlim
        if reorder:
            self.reorder()
        if C is None:
            C = self.values
        ### [buggy] pymisca updated transpose
        C = C.T

        if xtick is None:
            if short:
                xtick = self.colName_short()
            else:
                xtick = self.columns

        if ytick is None:
            # Too many row labels would be unreadable; drop them past 500.
            if len(self) < 500:
                ytick = self.index
            else:
                ytick = None

        if transpose:
            xtick, ytick = ytick, xtick
            xlab, ylab = ylab, xlab

        # [DEV] addBox() is disabled for now, so the image is returned as-is.
        return pyvis.heatmap(C,
                             xlab=xlab,
                             ylab=ylab,
                             transpose=transpose,
                             cname=cname,
                             vlim=vlim,
                             xtick=xtick,
                             ytick=ytick,
                             tickMax=tickMax,
                             ax=ax,
                             **kwargs)
Beispiel #4
0
def qc_Sort(df=None,
            fname=None,
            cname='test',
            vlim=[-2, 2],
            title=None,
            xlim=None,
            ylim=None,
            figsize2=[14, 6],
            nMax=5000,
            **heatargs):
    '''Plot row statistics and three heatmaps sorted by those statistics.

    The data comes from ``df`` (a DataFrame or array-like) or, when ``df``
    is None, from ``pyutil.readData(fname)``.  ``qcAvg`` supplies the
    per-row ``(M, V, CV)`` vectors; the rows are then shown sorted by
    ``V``, ``CV`` and ``M`` in a 3x1 figure.  Extra ``heatargs`` are
    forwarded to ``pyvis.heatmap`` together with ``vmin``/``vmax``/
    ``cname``/``vlim``.

    Returns ``((M, V, CV), figs)`` where ``figs`` is an OrderedDict with
    the keys 'qcAvg' and 'qcSort'.
    '''
    figs = collections.OrderedDict()
    vmin, vmax = vlim
    if df is None:
        df = pyutil.readData(fname)
        title = '[file]%s' % fname if title is None else title
    heatargs.update(vmin=vmin, vmax=vmax, cname=cname, vlim=vlim)
    C = df.values if isinstance(df, pd.DataFrame) else df

    (M, V, CV), axsLst = qcAvg(C, silent=0, xlim=xlim, ylim=ylim, nMax=nMax)
    figs['qcAvg'] = plt.gcf()
    plt.suptitle(title)

    # Unary minus binds tighter than //, so this is -ceil(len(C) / 1000):
    # a negative stride that walks the sorted rows in descending order and
    # keeps at most 1000 of them.
    inter = (-len(C)) // 1000

    fig, axs = plt.subplots(3,
                            1,
                            figsize=figsize2,
                            gridspec_kw=dict(hspace=0.3))
    axs = axs.flat

    panels = (
        (V, 'sorted by Varaince'),
        (CV, 'sorted by CV'),
        (M, 'sorted by Average'),
    )
    for row, (stat, heading) in enumerate(panels):
        pyvis.heatmap(C[stat.argsort()][::inter],
                      transpose=1,
                      main=heading,
                      ax=axs[row],
                      **heatargs)

    axsLst = np.hstack([axsLst, axs])
    figs['qcSort'] = plt.gcf()

    return (M, V, CV), figs
Beispiel #5
0
def qc_GeneExpr(exprMat,
                idx=None,
                gene=None,
                gRef=None,
                id_col='Gene Name',
                show_ytick=None,
                condName=None,
                **kwargs):
    '''Plot a heatmap of selected rows of ``exprMat``.

    Rows are selected with ``idx``; when ``idx`` is None they are looked up
    with ``pyutil.gQuery(gene, gRef, id_col=id_col)``, in which case both
    ``gene`` and ``gRef`` are required.

    ``show_ytick`` controls the row labels: None (default) shows them only
    for at most 100 rows, any other value is honoured as given.  Labels
    come from ``gene.values``, else from ``gRef.loc[idx][id_col]``, else
    from ``idx`` itself.

    ``condName`` supplies the x ticks; a DataFrame is converted with
    ``meta2name``.  Remaining ``kwargs`` go to ``pyvis.heatmap``, whose
    result is returned.
    '''
    if idx is None:
        assert not (gene is None or gRef is None
                    ), 'Must specify "gene" and "gRef" when "idx" not provided'
        ### Query dataframe with id
        qRes = pyutil.gQuery(gene, gRef, id_col=id_col)
        idx = qRes.index

    # Only auto-detect when the caller did not decide: the previous
    # ``show_ytick or len(idx) <= 100`` silently overrode an explicit False.
    if show_ytick is None:
        show_ytick = len(idx) <= 100

    if gene is not None:
        ytick = gene.values
    elif gRef is not None:
        ytick = gRef.loc[idx][id_col]
    else:
        ytick = idx
        # Single-argument call form prints identically with the print
        # statement and with the print function.
        print('[WARN] ytick not defined')

    if condName is None:
        xtick = None
    elif isinstance(condName, pd.DataFrame):
        xtick = meta2name(condName)
    else:
        xtick = condName
    ax = pyvis.heatmap(exprMat[idx],
                       xlab='Sample ID',
                       ylab='Gene',
                       xtick=xtick,
                       ytick=ytick if show_ytick else None,
                       **kwargs)
    return ax
Beispiel #6
0
def figs__peakBW(
        peakFile,
        bwFiles,
        outerRadius=500,
        innerRadius=50,
        stepSize=10,
        center_summit=0,
        outIndexFunc=None,
        outIndex=pyutil.basename,
        ylim=[0, None],
        NCORE=4,
        squareSize=(0.2, 0.01),
        detailByChip=None,
        detailByGene=0,  ### runtime linear in number of genes
        name=None,
        **kwargs):
    '''Extract bigwig tracks around peaks and build a set of QC figures.

    Tracks are read with ``sdio.extract_bigwig_multiple`` using
    ``outerRadius`` as the radius; positions with ``abs(pos) <=
    innerRadius`` are averaged per track for the box plot and the average
    heatmap.

    ``outIndexFunc`` is the legacy name of ``outIndex`` and replaces it
    when given.  ``detailByChip=None`` enables the per-track heatmaps only
    when there are at most 100 rows.  ``name`` defaults to
    ``pyutil.getBname(peakFile)``.  ``squareSize`` is accepted but not used
    by the active code.  Remaining ``kwargs`` are forwarded to the
    extraction call.

    Returns ``(figs, (bwTracks, bwAvg))`` where ``figs`` is an OrderedDict
    keyed by 'pileUp-<name>', 'avgHeatmap-<name>' and, when enabled,
    'detailByChip-<name>/<key>' and 'detailByGene-<name>/<key>'.
    '''
    ### legacy
    if outIndexFunc is not None:
        # ``outIndex`` defaults to ``pyutil.basename``, so the former
        # ``assert outIndex is None`` failed for every caller that passed
        # only the legacy name.  Reject a genuine conflict only.
        assert outIndex is None or outIndex is pyutil.basename, (
            'Pass either "outIndexFunc" (legacy) or "outIndex", not both')
        outIndex = outIndexFunc
    else:
        assert outIndex is not None

    #### get the data ready
    bwTracks = sdio.extract_bigwig_multiple(bedFile=peakFile,
                                            fnames=bwFiles,
                                            radius=outerRadius,
                                            NCORE=NCORE,
                                            stepSize=stepSize,
                                            outIndex=outIndex,
                                            center_summit=center_summit,
                                            **kwargs)
    if detailByChip is None:
        if len(bwTracks) <= 100:
            detailByChip = 1
        else:
            detailByChip = 0

    if name is None:
        name = pyutil.getBname(peakFile)

    poss = bwTracks.columns.levels[1]
    innerPos = poss[abs(poss) <= innerRadius]

    # Drop duplicated columns (transpose, filter on the index, transpose back).
    bwTracks = bwTracks.T.query('~index.duplicated()').T

    bwAvg = pyutil.colGroupMean(bwTracks.reindex(columns=innerPos, level=1))
    bwAvg = scount.countMatrix(bwAvg).apply(pyutil.log2p1)

    ##### plotting
    figs = pyutil.collections.OrderedDict()
    fig, axs = plt.subplots(1, 3, figsize=[18, 6])

    ax = axs[0]
    bwAvg.boxplot(rot='vertical', ax=ax)
    ax.set_ylabel('log2(peak intensity)')
    ax.set_ylim(ylim)

    ax = axs[1]
    sqc.qc_pileUp(bwTracks, ax=ax, axLeg=axs[2])
    figs['pileUp-%s' % name] = plt.gcf()
    ax.set_ylim(ylim)

    #########
    bwAvg.heatmap(figsize=[20, 10])
    figs['avgHeatmap-%s' % name] = plt.gcf()

    pos = bwTracks.columns.get_level_values('pos')
    cname = 'binding'
    #########
    if detailByChip:
        for key, bwTrack in bwTracks.groupby(axis=1, level=0):
            dfc = bwTrack
            pyvis.heatmap(dfc,
                          transpose=0,
                          squareSize=(0.025, 0.2),
                          ytick=dfc.index,
                          xlab='distance to %s' % key,
                          vlim=ylim,
                          cname=cname)

            # Relabel the x ticks with the 'pos' values at the tick indices.
            ax = plt.gca()
            xticks = pos[ax.get_xticks().astype(int)[:-1]]
            ax.set_xticklabels(xticks, )

            figs['detailByChip-%s/%s' % (name, key)] = plt.gcf()

    ########
    if detailByGene:

        for key, bwTrack in bwTracks.groupby(axis=0, level=0):
            dfc = bwTrack.melt().pivot_table(
                index='bwFile',
                columns='pos',
            )
            pyvis.heatmap(
                dfc,
                transpose=0,
                squareSize=(0.025, 0.2),
                ytick=dfc.index,
                xlab='distance to %s' % key,
                vlim=ylim,
                cname=cname)
            ax = plt.gca()
            xticks = pos[ax.get_xticks().astype(int)[:-1]]
            ax.set_xticklabels(xticks, )

            figs['detailByGene-%s/%s' % (name, key)] = plt.gcf()

    return figs, (bwTracks, bwAvg)
Beispiel #7
0
def qc_dist(D,
            vlim=None,
            silent=1,
            axs=None,
            cutoff=0.75,
            reorder=1,
            method='average',
            distance_sort='descending',
            level=5):
    '''Plot a distance matrix. Perform hierarchical clustering if reorder == True

    ``D`` is a square distance matrix or its condensed 1-d form.  With
    ``reorder`` truthy the matrix is reordered along the dendrogram leaves
    and cut into clusters: an integral ``cutoff`` is the number of
    clusters, any other real ``cutoff`` is the cut height.  Cluster labels
    are renumbered in the order in which they appear along the reordered
    leaves.

    Unless ``silent``, three panels are drawn on ``axs`` (created when
    None): a histogram of distances, the dendrogram, and the heatmap.

    Returns ``(clu, axs)``.  ``clu`` holds one label per input row: a list
    when ``reorder`` is truthy, ``np.arange(len(D))`` otherwise.

    Raises TypeError when ``reorder`` is truthy and ``cutoff`` is not a
    real number.
    '''
    import itertools
    import numbers

    if D.ndim == 1:
        D = spdist.squareform(D)

    cut_by_height = False
    if reorder:
        # Integral must be tested first because every Integral is also a
        # Real.  Going through ``numbers`` also accepts numpy scalars.
        if isinstance(cutoff, numbers.Integral):
            par = {'n_clusters': cutoff}
        elif isinstance(cutoff, numbers.Real):
            par = {'height': cutoff}
            cut_by_height = True
        else:
            raise TypeError(
                'cutoff must be an int (n_clusters) or a float (height), '
                'got %r' % (cutoff, ))

        D_1d = spdist.squareform(D, checks=0)
        K = sphier.linkage(D_1d, method=method)

        dendo = sphier.dendrogram(
            K,
            no_plot=1,
            distance_sort=distance_sort,
        )
        od = dendo['leaves'][::-1]
        clu = sphier.cut_tree(K, **par).ravel()

        #### Sort cluster as dendogram shows
        mapper = {
            k: i
            for i, (k, _) in enumerate(itertools.groupby(clu[od]))
        }
        # A real list: a bare map() is a one-shot iterator on Python 3.
        clu = [mapper.get(k) for k in clu]
        D = D[od][:, od]
    else:
        clu = np.arange(len(D))

    if not silent:
        if axs is None:
            fig, axs = plt.subplots(1, 3, figsize=[14, 4])

        axs = axs.ravel()

        #### panel 0: histogram of distances
        ax = axs[0]
        plt.sca(ax)
        pyvis.histoLine(D.ravel(), xlim=vlim)
        plt.grid()
        plt.xlabel('distance')
        plt.ylabel('count')

        #### panel 1: dendrogram (left empty without reordering)
        ax = axs[1]
        plt.sca(ax)
        if reorder:
            sphier.dendrogram(K,
                              ax=ax,
                              orientation='left',
                              distance_sort=distance_sort,
                              truncate_mode='level',
                              p=level,
                              no_labels=1)
            if cut_by_height:
                plt.vlines(cutoff, *ax.get_ylim(), linestyles='--')

        #### panel 2: the (reordered) distance matrix
        ax = axs[2]
        plt.sca(ax)
        pyvis.heatmap(D, cname='distance', ax=ax, vlim=vlim)
        pyvis.hide_axis(ax)

    return clu, axs
Beispiel #8
0
    def _render(self,
                obj,
                axs=None,
                look='patch',
                figsize=[14, 6],
                tickMax=None,
                show_axa=None,
                show_axc=None,
                shortName=0,
                **kwargs):
        '''[CORE]: Contains rendering methods for different types of data

        Draws ``obj`` (a ``scount.countMatrix``) onto ``axb``, the middle
        axis of the ``(axa, axb, axc)`` triple in ``axs``, in the style
        selected by ``look``: 'matrix', 'patch', 'line', 'fill', 'tick',
        'text', 'gtf' or 'feats'.  ``axa`` receives the row labels and track
        name when ``show_axa`` is truthy; ``axc`` receives the colorbar for
        the 'matrix' look when ``show_axc`` is truthy.

        ``tickMax``, ``show_axa`` and ``show_axc`` fall back to the
        attributes of the same name on ``self`` when None.  ``kwargs`` is
        accepted but not used.

        Returns ``axs``.
        '''
        tickMax = self.tickMax if tickMax is None else tickMax
        if axs is None:
            # NOTE(review): a single Axes cannot be unpacked into the
            # (axa, axb, axc) triple required below, so callers currently
            # have to pass ``axs`` themselves -- confirm the intended
            # default layout.
            fig, axs = plt.subplots(1, 1, figsize=figsize, sharex='all')
        if show_axa is None:
            show_axa = self.show_axa
        if show_axc is None:
            show_axc = self.show_axc

        header_rotation = self.header_rotation
        assert isinstance(obj, scount.countMatrix)
        axa, axb, axc = axs
        axa.get_shared_y_axes().join(axa, axb)
        hide_axis(axa)

        #### [TEMP]
        if not show_axa:
            pyvis.hide_frame(axa)
            pyvis.hide_ticks(axa)
        hide_frame(axc)
        hide_axis(axc)

        vlim = obj.vlim
        xlines = np.linspace(0, len(obj), 20 + 1)
        axb.set_xticks(xlines, minor=True)
        xticks = None
        xticklabs = None
        xlim = None

        if look == 'matrix':
            res = obj.heatmap(ax=axb, cname=None, vlim=obj.vlim)
            if show_axc:
                plt.colorbar(mappable=res, cax=axc)
                axcy = axc.get_yaxis()
                axcy.tick_right()
                axcy.set_visible(True)

        elif look in ['patch']:
            df = obj.fillna(0)
            # The first three columns become one row of colour triples.
            pyvis.heatmap(
                df.values[None, :, :3].astype(float),
                ax=axb,
                cname=None,
            )

            dftick = getPatchTicks(df, tol=10)
            if not dftick.empty:
                xticks = dftick.M.values
                xticklabs = dftick.index.values
                axb.get_xaxis().tick_top()

        elif look == 'line':
            if obj.shape[0] == 1:
                obj = obj.transpose()

            axb.plot(obj.values, 'x')
            if vlim is not None:
                axb.set_ylim(vlim)

        elif look == 'fill':
            if obj.shape[0] == 1:
                obj = obj.transpose()
            xs, ys = obj.index, obj.values.ravel()
            axb.plot(xs, ys, '-')
            axb.fill_between(xs, 0, ys, interpolate=True)
            if vlim is not None:
                axb.set_ylim(vlim)
            xlim = pyutil.span(obj.index.values)

        elif look == 'tick':
            color = plt.rcParams['axes.prop_cycle'].by_key()['color'][0]
            axb.vlines(
                np.nonzero(obj.fillna(False).values),
                -1,
                1,
                colors=color,
                linewidth=0.5,
            )
        elif look == 'text':
            objc = obj.reset_index(drop=True).dropna()
            axb_ymid = 0.
            axb.set_ylim(-1, 1)

            def plotter(row):
                # Write the first cell of each row vertically at its position.
                i, val = row.name, row[0]
                axb.text(
                    i + 0.,
                    axb_ymid,
                    val,
                    horizontalalignment='center',
                    verticalalignment='center',
                    rotation='vertical',
                    size='large',
                )

            objc.apply(plotter, axis=1)

        elif look == 'gtf':
            feats = obj.to_feats()
            assert self.xlim is not None
            xlim = self.xlim
            axb.set_xlim(xlim)
            # Keeps the ylim below valid when ``feats`` is empty.
            i = -1

            for i, feat in enumerate(feats):
                tlim, _, _ = pybio.add_transcript(
                    feat,
                    ax=axb,
                    adjust_xlim=0,
                    force=1,  #### TBC can be dangerous
                    intronHeight=0.75,
                    exonHeight=1.0,
                    ycent=i,
                    debug=self.debug,
                )
            axb.set_ylim(-0.5, i + 0.5)

        elif look == 'feats':
            feats = obj.values.ravel()
            if isinstance(feats[0], Bio.SeqFeature.SeqFeature):
                feats = [seqFeat2plotFeat(feat) for feat in feats]
            for feat in feats:
                # Explicit loops: on Python 3 a bare map() is lazy, so the
                # patches and texts would never be added to the axis.
                feat.draw_feature()
                for patch in feat.patches:
                    axb.add_patch(patch)
                feat.draw_feat_name()
                for text in feat.feat_name:
                    axb._add_text(text)

            axb.set_ylim(-1., 1.)

        # NOTE(review): the ``xlim`` computed by the 'fill' and 'gtf' looks
        # is discarded here because ``self.xlim`` is passed instead --
        # confirm whether ``xlim=xlim`` was intended.
        xlim = self.get_xlim(xlim=self.xlim)
        print('[xlim]', xlim)
        axb.set_xlim(xlim)

        if self.debug:
            print('[xlim,ylim]', axb.get_xlim(), axb.get_ylim())
            print('[look]{look}\n[xticks]{xticks}\n[xticklabs]{xticklabs}'.
                  format(look=look, xticks=xticks, xticklabs=xticklabs))

        ##### set vertical grid if not ticking
        if look not in ['tick', 'gtf', 'fill', 'patch'] and self.showGrid:
            axb.grid(color='black', axis='x', linestyle='--', which='minor')
        if xticks is not None:
            axb.set_xticks(xticks)
            axb.set_xticklabels(xticklabs)
        else:
            hide_axis(axb, which='x')

        axb.set_ylabel('')
        hide_axis(axb, which='y')

        if show_axa:
            ############################################
            #### Add row label within each track #######
            ############################################
            if look in ['fill', 'float']:
                # Label the y tick values instead of column names.
                colnames = axb.get_yticks()
                for i, col in enumerate(colnames):
                    axa.text(
                        1.,
                        col,
                        str(col)[:10],
                        horizontalalignment='right',
                        verticalalignment='center',
                        clip_on=show_axa,
                    )

            else:
                colnames = obj.colName_short() if shortName else obj.columns
                # tickMax == -1 disables the limit on the number of labels.
                if len(colnames) < tickMax or tickMax == -1:
                    for i, col in enumerate(colnames):
                        axa.text(
                            1.,
                            i,
                            str(col)[:25],
                            horizontalalignment='right',
                            verticalalignment='center',
                            clip_on=show_axa,
                        )

        ### Add track name
        track_ax = axb if not show_axa else axa

        axb.yaxis.set_ticks([])
        if obj.name is not None and obj.name != 'None':
            trackName = pyutil.formatName(obj.name)
            if header_rotation == 'horizontal':
                track_ax.text(
                    -.0,
                    0.5,
                    trackName,
                    rotation='horizontal',
                    transform=track_ax.transAxes,
                    horizontalalignment='right',
                    verticalalignment='center',
                )
            elif header_rotation == 'vertical':
                track_ax.set_ylabel(trackName)
            track_ax.yaxis.set_visible(True)
            track_ax.yaxis.set_ticks([])
        if self.debug:
            print('[axb.get_xticks()]%s' % axb.get_xticks())
        return axs
Beispiel #9
0
# In[ ]:

# Fit a small mixture model on X and plot it.
mi = 6
m = mym.GMMLRP_VIMAP(name='t%d' % mi, D=2, K=5).init_model()
m.fit(X=X)
plotModel(m, X)

# In[ ]:

# Inspect the per-sample output of predict_proba as a heatmap.
logP = m.predict_proba(X)
score = pyutil.logsumexp(logP, axis=1, keepdims=1)
pyvis.heatmap(logP.T)
# Bare expression: only displayed when run as a notebook cell.
pyutil.logsumexp(logP, axis=1, log=0)

# In[ ]:

if __name__ == '__main__':
    import sys
    sys.exit(0)
    # Unreachable after sys.exit(); kept as a record of the command used to
    # export the notebook.
    get_ipython().system(u'jupyter nbconvert test__gmm.ipynb --to python')

# In[ ]:

# %matplotlib inline
# The exported ``if 0: from __future__ import absolute_import`` was removed:
# a __future__ import anywhere but the top of the file is a SyntaxError even
# inside dead code, which prevented the whole script from compiling.
Beispiel #10
0
    def _render(self,
                obj,
                axs=None,
                look='patch',
                figsize=[14, 6],
                shortName=1,
                **kwargs):
        '''[CORE]: Render one count-matrix track onto an axis triple.

        ``obj`` must be a ``CountMatrix.countMatrix``.  ``axs`` is unpacked
        into ``(axa, axb, axc)``: the data is drawn on ``axb`` according to
        ``look`` ('matrix', 'patch' or 'line'), the row labels and track name
        are written on ``axa``, and ``axc`` holds the colorbar for the
        'matrix' look.  ``kwargs`` is accepted but not used.

        Returns ``axs``.
        '''
        if axs is None:
            fig, axs = plt.subplots(1, 1, figsize=figsize, sharex='all')
            centre = axs
        else:
            centre = axs[1]
        assert isinstance(obj, CountMatrix.countMatrix)
        axa, axb, axc = axs
        axa.get_shared_y_axes().join(axa, axb)

        hide_axis(axa)
        hide_frame(axc)
        hide_axis(axc)

        vlim = obj.vlim
        if look == 'line':
            axb.plot(obj.values, 'x')
            if vlim is not None:
                axb.set_ylim(vlim)
        elif look == 'patch':
            filled = obj.fillna(0)
            filled[3] = 1
            # Only the first three columns are passed on to the heatmap.
            pyvis.heatmap(filled.values[None, :, :3], ax=axb, cname=None)
        elif look == 'matrix':
            image = obj.heatmap(ax=axb, cname=None)
            plt.colorbar(mappable=image, cax=axc)
            cbar_yaxis = axc.get_yaxis()
            cbar_yaxis.tick_right()
            cbar_yaxis.set_visible(True)

        #### Common x-axis layout: one cell per entry, 20 grid intervals
        n_rows = len(obj)
        axb.set_xlim(0 - 0.5, n_rows - 0.5)
        axb.grid(color='black', axis='x', linestyle='--')
        axb.set_xticks(np.linspace(0, n_rows, 20 + 1))
        axb.set_ylabel('')
        hide_axis(axb)

        #### Add row label within each track
        if shortName:
            labels = obj.colName_short()
        else:
            labels = obj.columns
        text_style = dict(horizontalalignment='right',
                          verticalalignment='center')
        for row, label in enumerate(labels):
            axa.text(1., row, str(label)[:20], clip_on=True, **text_style)

        ### Add track name, vertically centred on the label axis
        track_name = pyutil.formatName(obj.name)
        y_mid = sum(axa.get_ylim()) / 2.
        axa.text(-.0, y_mid, track_name, **text_style)
        axa.yaxis.set_visible(True)
        axa.yaxis.set_ticks([])

        return axs
Beispiel #11
0
def qc_ModelDict(dd=None,
                 fname=None,
                 ali=None,
                 geneKey=None,
                 DIR=None,
                 clu=None,
                 cluMax=100,
                 vlim=None):
    if isinstance(geneKey, dict):
        geneKey = pd.Dataframe.from_dict(geneKey)
        geneKey[1] = geneKey.index
        geneKey.rename(columns={0: 'Bio Name', 1: 'Gene Name'})
    if dd is None:
        dd = countMatrix.from_npy(fname)
        ali = fname.rsplit('.', 1)[0]

    if dd.suc == 0:
        print '[WARN] this model is empty due to a failure %s' % dd['name']
        return
    if vlim is None:
        vlim = np.span(dd.train_data, p=99.9)
#         geneKey.rename({})
    sper = 0
    #     ali = NBNAME+'_h%d_'%75+
    if ali is not None:
        ali = ali.rsplit('/', 1)[-1]
    else:
        ali = dd.__dict__.get('name', 'test')
        if isinstance(ali, list):
            ali = ':'.join(ali)
    DIR = os.path.abspath(DIR or '.')

    #     os.system('mkdir -p %s/src'%DIR)
    #     os.system('mkdir -p %s'%DIR)
    print '[ALI]', DIR, '/', ali

    mdl, tX = dd.model, dd.train_data
    tXsd = stdNorm(tX)
    gRef, condName = dd.rowName, dd.colName_short()

    #### Process rowName
    gRef = pd.DataFrame({'Gene Name': gRef, 'Gene ID': gRef})
    if geneKey is not None:
        gRef = findMarker(gRef,
                          geneKey=geneKey,
                          silent=1,
                          how='left',
                          concise=1)
        gRef['isMarker'] = ~gRef['Bio Name'].isnull()
        gRef = gRef.rename(columns={'Query ID': 'Gene Name'}).drop('Hit ID', 1)
        print '[GREF]', len(gRef)

    if isinstance(mdl, list):
        print dd.nCol
        tX = tX[:, :dd.nCol[0]]
        nidx = np.isnan(tX[:, 0])
        mdl = mdl[0]
        if any(nidx):
            tX, nX = tX[~nidx], tX[nidx]
            nn = sum(nidx)
            Y = mdl.predict(tX)
            s = mdl.score_samples(tX)
            Y, pos = sortLabel(Y, tX)
            Y = np.hstack([Y, [max(Y) + 1] * nn])
            s = np.hstack([s, [-1] * nn])
            sbin = s > np.percentile(s, sper)
            print tX.shape, sbin.shape
        else:
            Y = mdl.predict(tX)
            s = mdl.score_samples(tX)
            sbin = s > np.percentile(s, sper)
            Y, pos = sortLabel(Y, tX)
        tX = dd.train_data
    else:
        Y = mdl.predict(tX)
        s = mdl.score_samples(tX)
        sbin = s > np.percentile(s, sper)
        Y, pos = sortLabel(Y, tX)
#     dd.setDF(tX)

# pcommon= {}
    try:
        os.system('mkdir -p %s/%s' % (DIR, ali))
        #         _ , ali = ali.rsplit('/',1)
        CWD = os.getcwd()
        _ = os.chdir('%s/%s' % (DIR, ali))
        os.system('mkdir -p src/')
        OFILE = open('main.md', 'w')
        ExcelFile = pd.ExcelWriter('main.xlsx', engine='xlsxwriter')
        with pyutil.RedirectStdStreams(OFILE):
            #             print dd.param
            parDF = dd.param if isinstance(dd.param, list) else [dd.param]
            parDF = pd.DataFrame(parDF)
            #             print '[pAss]'
            print '\n', pyutil.pd2md(parDF)
            #             for k,v in .items():
            #                 print '%s:%s\n'%(k,v)
            print 'Directory: %s \n \n  Model Name: %s' % (DIR, ali)
            print '\n [.xlsx](main.xlsx)',
            print '[.tar.gz](main.tar.gz)',
            for clu in range(-1, max(Y) + 1):
                if clu == cluMax:
                    break
                fig, axs = plt.subplots(2,
                                        1,
                                        figsize=[
                                            max(7, min(14,
                                                       len(tX) / 3.)),
                                            max(5, min(18,
                                                       len(tX.T) / 1.5))
                                        ],
                                        sharex='all',
                                        gridspec_kw={
                                            'bottom': 0.28,
                                            'top': 0.8,
                                            'left': 0.2
                                        })
                if clu == -1:
                    #                     Y,pos = sortLabel(Y,tX)
                    cidx = Y > -1
                    gCur = gRef[cidx]
                    sidx = np.argsort(Y)
                    gCur = gCur.iloc[sidx]
                    cluName = 'background.gene'
                else:
                    cidx = Y == clu
                    cidx = cidx & sbin
                    gCur = gRef[cidx]
                    sidx = np.argsort(s[cidx])[::-1]
                    gCur = gCur.iloc[sidx]
                    cluName = 'clu%03d.gene' % (clu)
                print '\n Cluster:%d' % (
                    clu), '\n', 'Gene Count:%d' % len(gCur)
                if len(gCur) == 0:
                    continue

#                 gCur = pd.DataFrame({'Gene Name':gCur})
#                 if clu>=0:
                if geneKey is not None:
                    gMark = gCur[gCur['isMarker'] == 1]
                    #                     gMark = findMarker(gCur['Gene Name'],geneKey=geneKey,silent=1,how='right')
                    gMark.to_excel(ExcelFile, cluName, index=True, startcol=2)
                    print '\n', pyutil.pd2md(gMark)
#                     gCur = gMark
#                     gCur = gMark.rename(columns={'Query ID':'Gene Name'})
                gCur[['Gene Name']].to_csv(
                    'src/%s' % cluName,
                    index=0,
                )
                #                 gCur[['Gene Name']].to_excel(ExcelFile,cluName,index=True,)
                dd.df.loc[gCur['Gene Name']].to_excel(
                    ExcelFile,
                    cluName,
                    index=True,
                )

                figList = []
                matDict = {'raw': tX, 'stdNorm': tXsd}
                sheet_curr = ExcelFile.sheets[cluName]
                for i, k in enumerate(['raw', 'stdNorm']):
                    C = matDict[k]
                    ax = axs[i]
                    im = pyvis.heatmap(
                        C[cidx][sidx],
                        ylab=(None if not i else 'Gene'),
                        ytick=(None if not i else gCur['Gene Name']),
                        xlab='Condition',
                        xtick=condName,
                        transpose=1,
                        vmin=vlim[0],
                        vmax=vlim[1],
                        ax=ax)
                    dd.addBox(ax=ax)
                    #                     figList +=[FFname]

                    plt.colorbar(im)
                    plt.title(k)
                plt.suptitle('Cluster %d' % clu, y=1)
                #                 try:
                #                 fig.tight_layout()
                #                 except:
                #                     print '\n \[WARN\] tight_layout() failed, legend may not display properly'
                #                     pass
                FFname = 'src/clu%03d.png' % (clu, )
                FigMd = pyutil.showsavefig(fname=FFname)
                print '\n', (FigMd)  ## remove directory name
                sheet_curr.insert_image(0, 7, FFname)
                plt.show()
                plt.close()

        ExcelFile.save()
        ExcelFile.close()
        os.system('pdext {fname} html'.format(fname=OFILE.name))
        s = '[{0}]({0}/)'.format(dd.name)
        if pyutil.hasIPD:
            pyutil.ipd.display(pyutil.ipd.Markdown(s))
        else:
            print s
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        raise e
    finally:
        os.chdir(CWD)
Beispiel #12
0
def qc_minfo(
    resA=None,
    resB=None,
    cluA=None,
    cluB=None,
    X=None,
    CUTOFF=30,
    xlab=None,
    ylab=None,
    maxLine=4,
    vlim=[-2, 2],
    silent=1,
    short=1,
):
    '''Compare two clusterings and optionally display the log-bias matrix.

    The per-sample cluster probabilities of the two clusterings are
    log-transformed and cross-tabulated with ``pyutil.proba_crosstab`` into
    a joint log-distribution ``logC``.  Its marginals are removed with
    ``pyutil.wipeMarg`` to give ``entC``, and three summary numbers are
    derived with ``pyutil.entExpect``.

    Parameters
    ----------
    resA, resB : result objects, optional
        Objects exposing ``.model.predict_proba``, ``.index``, ``.reindex``,
        ``.values`` and ``.formatName``.  When ``resA`` is given, ``cluA``
        and ``cluB`` are recomputed from the models and any values passed
        for them are ignored.
    cluA, cluB : array-like, optional
        Precomputed cluster probabilities; required when ``resA`` is None.
    X : array-like, optional
        Data fed to both models.  When omitted, ``resB`` is reindexed onto
        the index of ``resA`` and each model predicts on its own values.
    CUTOFF : number or None
        Only used for plotting: clusters whose marginal probability times
        the number of samples is not above ``CUTOFF`` are hidden from the
        second heatmap.  ``None`` shows every cluster.
    xlab, ylab : str, optional
        Axis labels for the second heatmap; default to
        ``formatName(maxLine=maxLine)`` of ``resA`` / ``resB`` if available.
    maxLine : int
        Forwarded to ``formatName``.
    vlim : sequence
        Forwarded to ``pyvis.heatmap`` for the second heatmap.  The default
        list is only read and forwarded here.
    silent : bool
        When false, print MI/H1/H2 and draw the two heatmaps.
    short : bool
        Selects the return shape (see below).

    Returns
    -------
    ``[MI, H1, H2]`` when ``short`` is true, otherwise the pair
    ``([MI, H1, H2], [entC, logC, margs])``.

    Raises
    ------
    AssertionError
        If ``resA`` is None and either ``cluA`` or ``cluB`` is missing.
    '''

    if resA is not None:
        ##### Index management is crucial here: both probability matrices
        ##### must describe the same samples in the same order.
        if X is not None:
            cluA = resA.model.predict_proba(X)
            cluB = resB.model.predict_proba(X)
        else:
            index = resA.index
            resA = resA.reindex(index)
            resB = resB.reindex(index)
            cluA = resA.model.predict_proba(resA.values)
            cluB = resB.model.predict_proba(resB.values)
    else:
        assert cluA is not None, 'cluA is required when resA is None'
        #### cluB is log-transformed below as well, so fail early and
        #### clearly instead of deep inside numpy.
        assert cluB is not None, 'cluB is required when resA is None'

    N = len(cluA)
    cluA = np.log(cluA)
    cluB = np.log(cluB)

    logC = pyutil.proba_crosstab(
        cluA, cluB)  #### estimate joint distribution of labels
    margs = pyutil.get_marginal(logC)  #### calculate marginal
    entC = pyutil.wipeMarg(logC,
                           margs=margs)  #### wipe marginals from jointDist

    MI = pyutil.entExpect(logC)
    H1 = -pyutil.entExpect(margs[0])
    H2 = -pyutil.entExpect(margs[1])

    if not silent:
        #### Single-argument print() yields the same text under the Python 2
        #### print statement and the Python 3 print function.
        print('MI= %s' % (MI, ))
        print('H1= %s' % (H1, ))
        print('H2= %s' % (H2, ))
        _, axs = plt.subplots(1, 2, figsize=[14, 4])
        axs = axs.ravel()
        if resA is not None and xlab is None:
            xlab = resA.formatName(maxLine=maxLine)
        if resB is not None and ylab is None:
            ylab = resB.formatName(maxLine=maxLine)

        im = entC
        if CUTOFF is not None:
            #### keep clusters whose expected member count exceeds CUTOFF
            xidx = np.where((np.exp(margs[0].ravel()) * N) > CUTOFF)[0]
            yidx = np.where((np.exp(margs[1].ravel()) * N) > CUTOFF)[0]
            im = im[xidx][:, yidx]
        else:
            #### No filtering: label every row/column with its own index so
            #### the tick arguments below are always defined.
            #### NOTE(review): assumes entC is a 2-D array, as the indexing
            #### in the CUTOFF branch already does.
            xidx = np.arange(im.shape[0])
            yidx = np.arange(im.shape[1])

        pyvis.heatmap(logC, transpose=1, cname='log proba', ax=axs[0])
        pyvis.heatmap(im.T,
                      vlim=vlim,
                      cname='log likelihood ratio',
                      ax=axs[1],
                      xlab=xlab,
                      ylab=ylab,
                      ytick=yidx,
                      xtick=xidx)

    summary = [MI, H1, H2]
    if short:
        return summary
    return summary, [entC, logC, margs]