Ejemplo n.º 1
0
    def barplot_count_ORF_CDS_by_frame(self,
                                       alpha=0.5,
                                       bins=40,
                                       xlabel="Frame",
                                       ylabel="#",
                                       bar_width=0.35):
        """Draw paired bars with the number of ORF and CDS found per frame.

        For each of the six frames (-3, -2, -1, 1, 2, 3) the rows of
        ``self._ORF_pos`` having a non-missing ``len_ORF`` (respectively
        ``len_CDS``) are counted. ``self._find_ORF_CDS()`` is called first
        when ``self._ORF_pos`` is still None.

        :param float alpha: transparency of the bars
        :param int bins: not used by this method
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param float bar_width: width of each bar; the ORF and CDS bars are
            shifted by half of it on each side of the frame value
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]
        orf_counts = []
        cds_counts = []
        for frame in frames:
            # rows belonging to the current frame, shared by both counts
            in_frame = self._ORF_pos[self._ORF_pos["frame"] == frame]
            orf_counts.append(in_frame["len_ORF"].dropna().shape[0])
            cds_counts.append(in_frame["len_CDS"].dropna().shape[0])

        centers = np.array(frames)
        shift = bar_width / 2
        series = (
            (centers - shift, orf_counts, "ORF N = %d"),
            (centers + shift, cds_counts, "CDS N = %d"),
        )
        for positions, counts, template in series:
            pylab.bar(positions,
                      counts,
                      bar_width,
                      alpha=alpha,
                      label=template % sum(counts))

        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Ejemplo n.º 2
0
    def pdf(self, x, params, normalise=True):
        """Evaluate the Gaussian mixture density at *x*.

        :param x: value(s) at which the density is evaluated
        :param params: flat sequence of Gaussian parameters ordered as
            mu, sigma, pi, mu2, sigma2, pi2, ...
        :param bool normalise: if True, the weights (pi) are rescaled so
            that they sum to 1 before being used
        :return: the weighted sum of the Gaussian densities, or 0 if any
            weight is negative. Components with a null sigma are ignored.

        As a side effect, :attr:`k` is set to the number of components
        found in *params*.
        """
        assert divmod(len(params), 3)[1] == 0
        assert len(params) >= 3 * self.k
        # Integer division: k is a number of components and is used as a
        # loop bound, so it must not become a float.
        k = len(params) // 3

        self.k = k

        # Force a float array: with integer weights the normalisation below
        # would otherwise fail (or truncate) on an integer array.
        pis = np.array(params[2::3], dtype=float)

        if (pis < 0).any():
            return 0
        if normalise is True:
            # sum of the pis must equal 1, otherwise the fit may diverge badly
            pis = pis / pis.sum()

        import scipy.stats as ss
        data = 0
        for i in range(k):
            mu = params[i * 3]
            sigma = params[i * 3 + 1]
            if sigma != 0:
                data += pis[i] * ss.norm.pdf(x, mu, sigma)
        return data
Ejemplo n.º 3
0
 def get_percentage_genes_covered_at_this_fraction(self, this):
     """Return the percentage of genes whose coverage exceeds *this*.

     The percentage of rows of ``self.df`` whose ``self.coverage_column``
     value is strictly greater than a threshold is computed for 101
     evenly spaced thresholds between 0 and 1, and then linearly
     interpolated at *this*.

     :param this: fraction, which must lie in [0, 1]
     :return: interpolated percentage (0-100)
     """
     assert this <= 1 and this >= 0
     column = self.coverage_column
     thresholds = pylab.linspace(0, 1, 101)
     total = float(len(self.df))
     percentages = []
     for threshold in thresholds:
         percentages.append(sum(self.df[column] > threshold) / total * 100)
     return np.interp(this, thresholds, np.array(percentages))
Ejemplo n.º 4
0
    def moving_average(self, n, circular=False):
        """Compute the moving average of the genome coverage.

        :param n: window's size. Must be odd
        :param bool circular: is the chromosome circular or not

        The result is stored in the :attr:`df` attribute (dataframe) in a
        column named *ma*. In the circular case, the extended coverage and
        the trimmed moving average are also kept in the :attr:`data` and
        :attr:`ma` attributes.
        """
        N = len(self.df['cov'])
        assert n < N/2
        from sequana.stats import moving_average

        # Linear case: windowed sums from the cumulative sum, labelled so
        # that each average sits in the middle of its window.
        cumulative = np.cumsum(np.array(self.df["cov"]), dtype=float)
        cumulative[n:] = cumulative[n:] - cumulative[:-n]
        averaged = cumulative[n - 1:] / n
        half = int(n / 2)
        centered_index = np.arange(start=half, stop=(len(averaged) + half))
        self.df["ma"] = pd.Series(averaged, index=centered_index)

        if not circular:
            return

        # FIXME: shift of +-1 as compared to non circular case...
        # Circular case: pad the coverage with its last and first n values
        # before averaging, then trim back to the original length.
        coverage = self.df['cov'].values
        self.data = list(coverage[N-n:]) + list(coverage) + list(coverage[0:n])
        ma = moving_average(self.data, n)
        self.ma = ma[n//2+1:-n//2]
        self.df["ma"] = pd.Series(self.ma, index=self.df['cov'].index)
Ejemplo n.º 5
0
 def plot_bar(self, spikes_filename=None, ratio=100):
     """Bar plot of the spikes found, with the scaled lengths overlaid.

     :param spikes_filename: forwarded to :meth:`spikes_found`
     :param ratio: the lengths taken from ``self.SIRV_lengths`` are
         divided by this value before being plotted as a line
     :return: the object returned by :meth:`spikes_found`
     """
     found = self.spikes_found(spikes_filename)
     scaled_lengths = np.array(
         [self.SIRV_lengths[name] for name in found.index]) / ratio
     found.plot(kind="bar")
     pylab.plot(scaled_lengths)
     pylab.tight_layout()
     return found
Ejemplo n.º 6
0
    def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4,
            hold=False, ax=None):
        """Plot the mean quality per read position with a +/- 1 std band.

        The columns "0" to "41" of ``self.df`` are read for the positions
        1 to 150 and divided by ``self.metadata['ReadNum']`` to obtain the
        mean and standard deviation of the quality at each position.

        :param color_line: color of the mean quality line
        :param bgcolor: not used by this method
        :param color: color of the band between mean - std and mean + std
        :param lw: width of the mean quality line
        :param hold: not used by this method
        :param ax: if provided, made the current axes before plotting

        As a side effect, :attr:`xmax` is set to 150.
        """
        # not sure why we have phred score from 0 to 41
        quality = self.df[[str(x) for x in range(42)]]
        N = self.metadata['ReadNum']

        self.xmax = 150
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)  # pragma no cover

        # Background bands: red below 20, orange up to 30, green up to 41.
        for low, high, band_color in ((0, 20, 'red'),
                                      (20, 30, 'orange'),
                                      (30, 41, 'green')):
            pylab.fill_between([0, xmax], [low, low], [high, high],
                               color=band_color, alpha=0.3)

        X = []
        Q = []
        S = []
        for pos in range(1, xmax):
            counts = quality.loc[pos]
            mean_quality = sum((int(k) + 1) * v for k, v in counts.items()) / N
            proba = counts / N
            # NOTE(review): the mean weights each count by (score + 1) while
            # the deviation below uses scores 0..41; kept as in the original,
            # to be confirmed.
            std = pylab.sqrt(sum((x - mean_quality) ** 2 * y
                                 for x, y in zip(range(42), proba)))
            X.append(pos)
            Q.append(mean_quality)
            S.append(std)

        Q = np.array(Q)
        X = np.array(X)
        S = np.array(S)
        pylab.fill_between(X, Q+S, Q-S,
            color=color, interpolate=False)

        pylab.plot(X, Q, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax+1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis='x')
Ejemplo n.º 7
0
    def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
        """Bar plot of the number of ORF and CDS found in each frame.

        ``self._find_ORF_CDS()`` is called first if ``self._ORF_pos`` is
        None. Rows of ``self._ORF_pos`` are then counted per frame, keeping
        only those with a non-missing ``len_ORF`` / ``len_CDS``.

        :param float alpha: transparency of the bars
        :param int bins: not used by this method
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param float bar_width: width of the bars
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]

        def count_defined(column):
            # per frame, number of rows whose `column` is not missing
            return [
                self._ORF_pos[self._ORF_pos["frame"] == fr][column].dropna().shape[0]
                for fr in frames
            ]

        nb_res_ORF = count_defined("len_ORF")
        nb_res_CDS = count_defined("len_CDS")

        half_width = bar_width / 2
        x_positions = np.array(frames)
        pylab.bar(x_positions - half_width, nb_res_ORF, bar_width,
                  alpha=alpha, label="ORF N = %d" % sum(nb_res_ORF))
        pylab.bar(x_positions + half_width, nb_res_CDS, bar_width,
                  alpha=alpha, label="CDS N = %d" % sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Ejemplo n.º 8
0
    def get_required_coverage(self, M=0.01):
        r"""Return the required coverage to ensure the genome is covered

        A general question is what should be the coverage to make sure
        that e.g. E=99% of the genome is covered by at least a read.
        Equating :math:`1 - e^{-c} = E`, where :math:`c` is the fold
        coverage, the answer is:

        .. math:: c = \log\left(\frac{-1}{E-1}\right)

        This equation is correct but has a limitation due to floating
        precision. If one provides E=0.99, the answer is 4.6 but we are
        limited to a maximum coverage of about 36 when one provides
        E=0.9999999999999999, after which E is rounded to 1 on most
        computers. Besides, it is not convenient to enter all those numbers.
        A scientific notation would be better but requires to work with
        :math:`M=1-E` instead of :math:`E`:

        .. math:: c = \log\left(\frac{1}{M}\right) = -\log(M)

        So instead of asking what is the requested fold coverage to have 99%
        of the genome covered, we ask what is the requested fold coverage to
        have 1% of the genome not covered. This allows us to use :math:`M`
        values as low as 1e-300, that is a fold coverage as high as 690.

        :param float M: this is the fraction of the genome not covered by
            any reads (e.g. 0.01 for 1%). See note above. A scalar must be
            in [0, 1); other inputs are converted to a numpy array and are
            not validated.
        :return: the required fold coverage (infinite for M=0)

        .. plot::

            import numpy as np
            import pylab
            from sequana import Coverage
            cover = Coverage()
            misses = np.array([1e-1, 1e-2, 1e-3, 1e-4,1e-5,1e-6])
            required_coverage = cover.get_required_coverage(misses)
            pylab.semilogx(misses, required_coverage, 'o-')
            pylab.ylabel("Required coverage", fontsize=16)
            pylab.xlabel("Uncovered genome", fontsize=16)
            pylab.grid()

        """
        # What should be the fold coverage to have 99% of the genome sequenced ?
        # It is the same question as equating 1-e^{-(NL/G}) == 0.99, we need NL/G = 4.6
        if isinstance(M, (float, int)):
            assert M < 1
            assert M >= 0
        else:
            M = np.array(M)
        # log(1/M) is written as -log(M): it avoids the explicit division
        # (which fails for M=0 although that value is accepted above) and
        # keeps working for very small M. A fully covered genome (M=0)
        # requires an infinite coverage, so the warning is silenced.
        with np.errstate(divide="ignore"):
            return -np.log(M)
Ejemplo n.º 9
0
    def plot(self,
             X=[0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1],
             fontsize=16,
             label=None):
        """Plot the percentage of genes (y-axis) whose coverage exceeds a
        given fraction (x-axis, shown as a percentage).

        :param X: coverage fractions at which the percentage of genes is
            evaluated
        :param fontsize: font size of the axis labels
        :param label: optional label attached to the curve

        Dashed red guide lines are added at 25, 50 and 75 on both axes.
        """
        column = self.coverage_column
        total = float(len(self.df))
        X = np.array(X)
        percentages = np.array(
            [sum(self.df[column] > fraction) / total * 100 for fraction in X])

        # only forward the label when one was provided
        curve_options = {} if label is None else {"label": label}
        pylab.plot(X * 100, percentages, "o-", **curve_options)

        pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
        pylab.ylabel("Percentage of genes covered", fontsize=fontsize)
        guide_style = dict(color="r", alpha=0.5, ls="--")
        for level in (25, 50, 75):
            pylab.axhline(level, **guide_style)
            pylab.axvline(level, **guide_style)
Ejemplo n.º 10
0
    def __init__(
        self,
        fold_changes=None,
        pvalues=None,
        color="auto",
        pvalue_threshold=0.05,
        fold_change_threshold=1,
    ):
        """.. rubric:: constructor

        :param list fold_changes: 1D array or list
        :param list pvalues: 1D array or list, same length as *fold_changes*
        :param color: stored in :attr:`color`
        :param pvalue_threshold: adds an horizontal dashed line at
            the threshold provided.
        :param fold_change_threshold: colors in grey the absolute fold
            changes below a given threshold.

        The two inputs are stored as numpy arrays and gathered in the
        :attr:`df` dataframe (columns *fold_change* and *pvalue*) before
        :meth:`_get_colors` is called.
        """
        changes = np.array(fold_changes)
        probabilities = np.array(pvalues)

        self.fold_changes = changes
        self.pvalues = probabilities
        self.color = color
        self.pvalue_threshold = pvalue_threshold
        self.fold_change_threshold = fold_change_threshold
        assert len(self.fold_changes) == len(self.pvalues)

        columns = {"fold_change": self.fold_changes, "pvalue": self.pvalues}
        self.df = pd.DataFrame(columns)
        self._get_colors()
Ejemplo n.º 11
0
    def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
        """Grouped bar plot built from ``self.sirv``.

        :param normalise: if True, each column is divided by its size
            relative to the largest one (``N / max(N)``)
        :param ncol: columns in the legend
        :param N: sizes used for the normalisation; if None, the lengths
            of the items of ``self.rawdata`` are used
        :return: the dataframe that was plotted
        """
        if N is None:
            sizes = np.array([len(item) for item in self.rawdata])
        else:
            sizes = np.array(N)

        dd = pd.DataFrame(self.sirv).T
        if normalise:
            relative_sizes = sizes / max(sizes)
            dd = dd / relative_sizes
        dd.columns = self.labels

        dd.plot(kind="bar")
        pylab.xlabel("")
        pylab.legend(self.labels, ncol=ncol)
        pylab.tight_layout()
        return dd
Ejemplo n.º 12
0
    def run(self):
        """Compute the shift correlation of every chromosome.

        For each chromosome in ``self.chromosomes`` the data are fetched
        with :meth:`get_data`, passed to :meth:`scc`, and :meth:`cor` is
        evaluated for every shift between ``start`` and ``stop`` (in units
        of ``binning``).

        :return: None if no alignment was read (``self.df`` is None),
            otherwise a tuple made of the per-chromosome results
            (dictionary with keys *data_length*, *Y* and *X*) and a
            dataframe of the correlations weighted by the relative data
            length of each chromosome (one column per chromosome, indexed
            by the real shift values).

        The results and the weights are also stored in :attr:`results`
        and :attr:`weights`.
        """
        ## 10% of the time in self.get_data and 90 in cor()
        if self.df is None:
            print("call read_align() method to read alignement file")
            return

        m = int(self.start / self.binning)
        M = int(self.stop / self.binning)
        # shifts expressed in bins, and their real values (bins * binning)
        shifts = range(m, M + 1, 1)
        Xreal = np.array(
            range(m * self.binning, (M + 1) * self.binning, self.binning))

        results = {}
        for chrom in self.chromosomes:
            data = self.get_data(chrom)
            data_length = len(data)
            self.scc(data)
            correlations = np.array([self.cor(shift) for shift in shifts])
            results[chrom] = {
                'data_length': data_length,
                'Y': correlations,
                'X': Xreal
            }

        # weighted average using the original length of the chromosomes
        lengths = np.array(
            [results[chrom]['data_length'] for chrom in self.chromosomes])
        weights = lengths / sum(lengths)

        self.results = results
        self.weights = weights

        weighted = [
            weight * results[chrom]['Y']
            for weight, chrom in zip(weights, self.chromosomes)
        ]
        df_avc = pd.DataFrame(weighted).T
        df_avc.index = Xreal
        return results, df_avc
Ejemplo n.º 13
0
    def __init__(self, data, k=2, method='Nelder-Mead'):
        """.. rubric:: constructor

        :param list data: the data to fit, stored as a numpy array
        :param int k: number of GMM to use
        :param str method: minimization method to be used (one of scipy
            optimise module). It is accepted but not stored by this
            constructor.

        """
        values = np.array(data)
        self.data = values
        self.size = float(len(values))

        # private attributes are set before the public ``k``
        self._k = k
        self._model = None
        self.k = k

        self.verbose = True
Ejemplo n.º 14
0
    def plot_contig_length_vs_nreads(self, fontsize=16):
        """Log-log plot of the contig lengths versus their number of reads.

        A least-squares line, fitted on the contigs having more than 10
        reads and a length above 100000, is overlaid between the smallest
        and largest contig lengths.

        Same as plot_scatter_contig_length_nread_cov, without the coverage
        colouring.

        :param fontsize: font size of the axis labels
        """
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()
        pylab.loglog(df.length, df.nread, "o")
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig N reads", fontsize=fontsize)
        pylab.grid()

        # least-squares fit nread = m * length + c on the selected contigs
        selected = df.query("nread>10 and length>100000")
        X = selected['length']
        Y = selected['nread']
        A = np.vstack([X, np.ones(len(X))]).T
        # Series.as_matrix() no longer exists in recent pandas; .values is
        # available in all versions.
        m, c = np.linalg.lstsq(A, Y.values, rcond=None)[0]
        x = np.array([m1, M1])
        pylab.plot(x, m * x + c, "o-r")
        pylab.tight_layout()
Ejemplo n.º 15
0
    def generalized_anscombe(x, mu, sigma, gain=1.0):
        """Compute the generalized anscombe variance stabilizing transform

        Data should be a mixture of poisson and gaussian noise.

        The input signal x is assumed to follow the Poisson-Gaussian noise
        model::

            x = gain * p + n

        where gain is the camera gain and mu and sigma are the read noise
        mean and standard deviation. X should contain only positive values.
        Values for which the transformed argument is negative are clipped
        to zero. Biased for low counts.

        :param x: input data (scalar, array-like, or an object supporting
            arithmetic such as a dataframe)
        :param mu: mean of the read noise
        :param sigma: standard deviation of the read noise
        :param gain: gain of the camera
        :return: the transformed data
        """
        def _transform(values):
            y = gain * values + (gain**2) * 3.0 / 8.0 + sigma**2 - gain * mu
            return (2.0 / gain) * np.sqrt(np.maximum(y, 0.0))

        try:
            # If a dataframe, we do not want to change it
            return _transform(x)
        except TypeError:
            # plain sequences (e.g. lists) do not support the arithmetic
            # above: retry on an array. Only TypeError is caught so that
            # interruptions and unrelated errors are not swallowed.
            return _transform(np.array(x))
Ejemplo n.º 16
0
    def plot_scatter_contig_length_nread_cov(self,
                                             fontsize=16,
                                             vmin=0,
                                             vmax=50,
                                             min_nreads=20,
                                             min_length=50000):
        """Scatter plot of contig reads versus contig length, coloured by
        the ``covStat`` column, on log-log axes.

        A least-squares line, fitted on the contigs having more than
        *min_nreads* reads and a length above *min_length*, is overlaid
        between the smallest and largest contig lengths.

        :param fontsize: font size of the axis labels
        :param vmin: lower bound of the colour scale
        :param vmax: upper bound of the colour scale
        :param min_nreads: contigs must have more reads than this value to
            be used in the fit
        :param min_length: contigs must be longer than this value to be
            used in the fit
        """
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()

        # least-squares fit nread = m * length + c on the selected contigs
        selected = df.query("nread>@min_nreads and length>@min_length")
        fit_lengths = selected['length']
        fit_nreads = selected['nread']
        A = np.vstack([fit_lengths, np.ones(len(fit_lengths))]).T
        # Series.as_matrix() no longer exists in recent pandas; .values is
        # available in all versions.
        m, c = np.linalg.lstsq(A, fit_nreads.values, rcond=None)[0]
        x = np.array([m1, M1])

        # the scatter itself shows all the contigs
        X = df['length']
        Y = df['nread']
        Z = df['covStat']
        pylab.scatter(X, Y, c=Z, vmin=vmin, vmax=vmax)
        pylab.colorbar()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig reads", fontsize=fontsize)
        pylab.title("coverage function of contig length and reads used")
        pylab.grid()
        pylab.plot(x, m * x + c, "o-r")
        pylab.loglog()
        pylab.tight_layout()
Ejemplo n.º 17
0
    def get_data(self, chrname, remove_anomalies=True):
        """Return the fragment positions of one reference, sorted by
        absolute value.

        Rows of ``self.df`` whose ``ref`` equals *chrname* are selected.
        The position is the ``start`` value for the '+' strand and minus
        the ``end`` value otherwise.

        :param chrname: name of the reference to select
        :param remove_anomalies: if true, the mask returned by
            :meth:`remove_anomalies` is applied to the result
        :return: dataframe with a single column named *x*
        """
        # Could be done once for all in read_alignment
        subset = self.df.query('ref==@chrname')

        positions = []
        for start, end, strand in zip(subset['start'], subset['end'],
                                      subset['strand']):
            positions.append(start if strand == '+' else -end)

        res = pd.DataFrame(np.array(positions))
        res.columns = ['x']

        # sort by absolute position using a temporary column
        res['abs'] = res['x'].abs()
        res = res.sort_values('abs')
        del res['abs']

        if not remove_anomalies:
            return res

        mask = self.remove_anomalies(res)
        return res[mask]
Ejemplo n.º 18
0
    def anscombe(x):
        r"""Compute the anscombe variance stabilizing transform.

        :param x: noisy Poisson-distributed data
        :return: data with variance approximately equal to 1.

        Reference: Anscombe, F. J. (1948), "The transformation of Poisson,
            binomial and negative-binomial data", Biometrika 35 (3-4): 246-254

        For Poisson distribution, the mean and variance are not independent. The
        anscombe transform aims at transforming the data so that the variance
        is about 1 for large enough mean; For mean zero, the variance is still
        zero. So, it transforms Poisson data to approximately Gaussian data with
        mean :math:`\sqrt{x+3/8} - 1/(4m^{1/2})`
        """
        try:
            # If a dataframe, we do not want to change it
            return 2.0 * np.sqrt(x + 3.0 / 8.0)
        except TypeError:
            # plain sequences (e.g. lists) cannot be added to a float:
            # retry on an array. Only TypeError is caught so that
            # interruptions and unrelated errors are not swallowed.
            return 2.0 * np.sqrt(np.array(x) + 3.0 / 8.0)
    colors = [cmap(i) for i in np.linspace(0, 1, len(list_analysis))]

# get results for curves

pylab.figure(figsize=(8, 8))
for i in range(len(list_analysis)):
    analysis = list_analysis[i]
    res = compute_table_performance(analysis, df_results)
    print("%s" % analysis)
    # [TP, FP, FN, TN]
    # print(len(res[0]), len(res[1]), res[2], res[3] , sum([len(res[0]), len(res[1]), res[2], res[3]]))
    TP = res[0]
    FP = res[1]
    FN = [0] * res[2]
    TN = [0] * res[3]
    y_true = np.array([1] * len(TP) + [1] * len(FN) + [0] * len(FP) +
                      [0] * len(TN))
    y_scores = np.array(TP + FN + FP + TN)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pylab.plot(recall, precision, color=colors[i], label=analysis)

pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.ylim([0.0, 1.05])
pylab.xlim([0.0, 1.05])
pylab.title('Precision-Recall')
#pylab.legend(loc="lower left")

lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#pylab.tight_layout()

if file_fig != "show":
Ejemplo n.º 20
0
 def _set_y(self, Y):
     """Store *Y*, converted to a numpy array, in ``self._Ytarget``."""
     target = np.array(Y)
     self._Ytarget = target
Ejemplo n.º 21
0
 def _set_x(self, X):
     """Store *X* as a numpy array in ``self._Xtarget`` and record its
     smallest and largest values in ``lower_bound`` / ``upper_bound``."""
     target = np.array(X)
     self._Xtarget = target
     self.lower_bound, self.upper_bound = min(target), max(target)
Ejemplo n.º 22
0
    def _add_patches(self, df, method, fill, ax, diagonal=True):
        """Add one symbol per cell of *df* on the axes *ax*.

        :param df: dataframe whose cells are read with ``df.iloc[x, y]``
            (presumably a square matrix of values in [-1, 1] such as a
            correlation matrix; to be confirmed against the caller)
        :param str method: symbol to draw: 'ellipse', 'square',
            'rectangle', 'color', 'circle', 'number', 'text' or 'pie'
        :param fill: 'lower' skips the cells where x > y, 'upper' skips
            the cells where x < y; any other value keeps all the cells
        :param ax: matplotlib axes receiving the texts and the collection
        :param bool diagonal: if False, the cells where x == y are skipped
        :raises ValueError: if *method* is not one of the known symbols

        The patches are gathered in a single PatchCollection coloured
        with ``self.cm`` and stored in :attr:`collection`. The 'number'
        and 'text' methods write directly on *ax* and create no patch.
        """
        width, height = df.shape
        labels = (df.columns)

        patches = []
        colors = []
        for x in range(width):
            for y in range(height):
                # skip the cells excluded by the fill / diagonal options
                if fill == 'lower' and x > y:
                    continue
                elif fill == 'upper' and x < y:
                    continue
                if diagonal is False and x == y:
                    continue

                # value used for the colour: (cell + 1) / 2
                datum = (df.iloc[x, y] + 1.) / 2.
                d = df.iloc[x, y]
                d_abs = np.abs(d)
                #c = self.pvalues[x, y]
                # the sign of the cell sets the orientation of the ellipses
                rotate = -45 if d > 0 else +45
                #cmap = self.poscm if d >= 0 else self.negcm
                if method in ['ellipse', 'square', 'rectangle', 'color']:
                    if method == 'ellipse':
                        func = Ellipse
                        # the larger the absolute value, the thinner the ellipse
                        patch = func(
                            (x, y),
                            width=1 * self.shrink,
                            height=(self.shrink - d_abs * self.shrink),
                            angle=rotate)
                    else:
                        func = Rectangle
                        # side proportional to the absolute value, centred
                        # on the cell through the offset
                        w = h = d_abs * self.shrink
                        #FIXME shrink must be <=1
                        offset = (1 - w) / 2.
                        if method == 'color':
                            # 'color' fills the whole cell
                            w = 1
                            h = 1
                            offset = 0
                        patch = func((x + offset - .5, y + offset - .5),
                                     width=w,
                                     height=h,
                                     angle=0)
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    #patch.set_facecolor(cmap(d_abs))
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    #ax.add_artist(patch)
                    patches.append(patch)
                    #FIXME edgecolor is always printed
                elif method == 'circle':
                    # diameter proportional to the absolute value
                    patch = Circle((x, y), radius=d_abs * self.shrink / 2.)
                    if self.edgecolor:
                        patch.set_edgecolor(self.edgecolor)
                    #patch.set_facecolor(cmap(d_abs))
                    colors.append(datum)
                    if d_abs > 0.05:
                        patch.set_linestyle('dotted')
                    #ax.add_artist(patch)
                    patches.append(patch)
                elif method in ['number', 'text']:
                    # NOTE(review): edgecolor is not assigned when d is NaN
                    # (neither d < 0 nor d >= 0 holds)
                    if d < 0:
                        edgecolor = self.cm(-1.0)
                    elif d >= 0:
                        edgecolor = self.cm(1.0)
                    # compact label: two decimals without the leading zero
                    d_str = "{:.2f}".format(d).replace("0.",
                                                       ".").replace(".00", "")
                    ax.text(x,
                            y,
                            d_str,
                            color=edgecolor,
                            fontsize=self.fontsize,
                            horizontalalignment='center',
                            weight='bold',
                            alpha=max(0.5, d_abs))
                    # withdash=False)
                elif method == 'pie':
                    # two wedges: the first spans 360 * |value| degrees
                    # starting at -90, the second completes the disc
                    S = 360 * d_abs
                    patch = [
                        Wedge((x, y), 1 * self.shrink / 2., -90, S - 90),
                        Wedge((x, y), 1 * self.shrink / 2., S - 90, 360 - 90),
                    ]
                    #patch[0].set_facecolor(cmap(d_abs))
                    #patch[1].set_facecolor('white')
                    # the complementary wedge gets the mid-scale value 0.5
                    colors.append(datum)
                    colors.append(0.5)
                    if self.edgecolor:
                        patch[0].set_edgecolor(self.edgecolor)
                        patch[1].set_edgecolor(self.edgecolor)

                    #ax.add_artist(patch[0])
                    #ax.add_artist(patch[1])
                    patches.append(patch[0])
                    patches.append(patch[1])
                else:
                    raise ValueError(
                        'Method for the symbols is not known. Use e.g, square, circle'
                    )

        # optionally reduce the colours to two values around 0.5
        if self.binarise_color:
            colors = [1 if color > 0.5 else -1 for color in colors]

        if len(patches):
            col1 = PatchCollection(patches,
                                   array=np.array(colors),
                                   cmap=self.cm)
            ax.add_collection(col1)

            self.collection = col1
            # Somehow a release of matplotlib prevent the edge color
            # from working but the set_edgecolor on the collection itself does
            # work...
            if self.edgecolor:
                self.collection.set_edgecolor(self.edgecolor)
Ejemplo n.º 23
0
    def estimate(self,
                 guess=None,
                 k=None,
                 maxfev=2e4,
                 maxiter=1e3,
                 bounds=None):
        """Fit the model by minimising ``self.model.log_likelihood``.

        :param guess: initial parameters as expected by the model; if
            None, :meth:`get_guess` is used
        :param k: if provided, replaces :attr:`k` before the fit
        :param maxfev: passed to the minimiser options
        :param maxiter: passed to the minimiser options
        :param bounds: passed to the minimiser
        :return: the object returned by :func:`scipy.optimize.minimize`,
            which is also stored in :attr:`results` and completed with the
            attributes *pis_raw*, *log_likelihood*, *AIC*, *AICc*, *BIC*,
            *pis* (normalised to sum to 1), *sigmas* and *mus*. Its *x*
            attribute is replaced by a plain list of the parameters.
        """
        if k is not None:
            self.k = k

        if guess is None:
            # estimate the mu/sigma/pis from the data
            guess = self.get_guess()

        from scipy.optimize import minimize
        solver_options = dict(maxiter=maxiter, maxfev=maxfev)
        res = minimize(self.model.log_likelihood,
                       x0=guess,
                       args=(self.data, ),
                       method=self.method,
                       options=solver_options,
                       bounds=bounds)

        self.results = res
        pis = np.array(self.results.x[2::3])
        self.results.pis_raw = pis.copy()

        # The ratios may be negative (e.g. -0.35, -0.15, which would
        # normalise to 0.7, 0.3). The detection of that case is disabled,
        # so the fit is never flagged as unstable.
        unstable = False

        # rebuild the flat parameter list (mu, sigma, pi per component)
        n_components = int(len(self.results.x) / 3)
        params = []
        for idx in range(n_components):
            offset = idx * 3
            params.extend([
                self.results.x[offset],
                self.results.x[offset + 1],
                pis[idx],
            ])
        self.results.x = params

        # FIXME shall we multiply by -1 ??
        self.results.log_likelihood = self.model.log_likelihood(
            params, self.data)
        if self.results.log_likelihood and unstable is False:
            log_l = self.results.log_likelihood
            n_points = self.data.size
            self.results.AIC = criteria.AIC(log_l, self.k, logL=True)
            self.results.AICc = criteria.AICc(log_l,
                                              self.k,
                                              n_points,
                                              logL=True)
            self.results.BIC = criteria.BIC(log_l,
                                            self.k,
                                            n_points,
                                            logL=True)
        else:
            # arbitrary large value for the three criteria
            self.results.AIC = 1000
            self.results.AICc = 1000
            self.results.BIC = 1000

        pis = np.array(self.results.x[2::3])

        self.results.pis = list(pis / pis.sum())
        self.results.sigmas = self.results.x[1::3]
        self.results.mus = self.results.x[0::3]

        return res
Ejemplo n.º 24
0
    def estimate(self, guess=None, k=2):
        """Estimate the mixture parameters with an EM loop.

        :param list guess: a list to provide the initial guess. Order is mu1, sigma1,
            pi1, mu2, ...
        :param int k: number of models to be used.

        The loop runs for ``self.max_iter`` iterations (there is no other
        convergence criterion) unless the responsibilities or the weights
        stop summing to the expected totals, in which case it stops,
        :attr:`status` is set to False and the offending values are kept
        in :attr:`debug`.

        The outcome is stored in :attr:`results` (attributes *x*, *nfev*,
        *success*, *mus*, *sigmas*, *pis*, *log_likelihood*, *AIC*, *AICc*
        and *BIC*). The responsibilities are stored in :attr:`gamma` and a
        snapshot of the means after each iteration in :attr:`mus`.
        """
        self.k = k
        # Initial guess of parameters and initializations
        if guess is None:
            # estimate the mu/sigma/pis from the data
            guess = self.get_guess()

        # float arrays: the updates below are written in place and would be
        # truncated if the guess was made of integers
        mu = np.array(guess[0::3], dtype=float)
        sig = np.array(guess[1::3], dtype=float)
        pi_ = np.array(guess[2::3], dtype=float)
        N_ = len(pi_)

        gamma = np.zeros((N_, int(self.size)))
        N_ = np.zeros(N_)
        p_new = guess

        # EM loop
        counter = 0
        converged = False

        self.mus = []

        import scipy.stats as ss
        while not converged:
            # Compute the responsibility func. and new parameters.
            # The loop variable must not reuse the name of the parameter k.
            for comp in range(0, self.k):
                # unstable if self.model.pdf is made of zeros
                gamma[comp, :] = pi_[comp] * ss.norm.pdf(
                    self.data, mu[comp], sig[comp])
                gamma[comp, :] /= (self.model.pdf(self.data,
                                                  p_new,
                                                  normalise=False))
                N_[comp] = gamma[comp].sum()
                mu[comp] = np.sum(gamma[comp] * self.data) / N_[comp]
                sig[comp] = np.sqrt(
                    np.sum(gamma[comp] * (self.data - mu[comp])**2) / N_[comp])
                pi_[comp] = N_[comp] / self.size

            # parameters of the previous iteration, kept in case the checks
            # below fail
            self.results = {'x': p_new, 'nfev': counter, 'success': converged}

            p_new = []
            for this in range(self.k):
                p_new.extend([mu[this], sig[this], pi_[this]])

            self.status = True
            # Explicit sanity checks (not stripped by python -O): the
            # responsibilities must sum to the data size and the weights
            # to 1. Written so that NaN values are reported as failures.
            totals_ok = abs(N_.sum() - self.size) / self.size < 1e-6
            weights_ok = abs(pi_.sum() - 1) < 1e-6
            if not (totals_ok and weights_ok):
                print("issue arised at iteration %s" % counter)
                self.debug = {'N': N_, 'pis': pi_}
                self.status = False
                break

            # mu is updated in place: store a copy, otherwise every entry
            # of the history would point to the same (final) array
            self.mus.append(mu.copy())

            # Convergence check
            counter += 1
            converged = counter >= self.max_iter

        self.gamma = gamma

        if self.status is True:
            self.results = {'x': p_new, 'nfev': counter, 'success': converged}

        self.results = AttrDict(**self.results)
        self.results.mus = self.results.x[0::3]
        self.results.sigmas = self.results.x[1::3]
        self.results.pis = self.results.x[2::3]

        log_likelihood = self.model.log_likelihood(self.results.x, self.data)

        self.results.log_likelihood = log_likelihood
        self.results.AIC = criteria.AIC(log_likelihood, self.k, logL=True)
        self.results.AICc = criteria.AICc(log_likelihood,
                                          self.k,
                                          self.data.size,
                                          logL=True)
        self.results.BIC = criteria.BIC(log_likelihood,
                                        self.k,
                                        self.data.size,
                                        logL=True)
Ejemplo n.º 25
0
    def get_df_concordance(self, max_align=-1):
        """Return per-alignment indel/match statistics and concordance.

        One row is produced for each alignment of ``self._data`` that carries
        at least one tag. The columns are ``concordance``, ``length``, ``I``
        (insertions), ``D`` (deletions), ``M`` (matches), ``mapq``, ``flags``,
        ``NM`` and ``mismatch`` (substitutions).

        The concordance is computed as::

            1 - (I + D + S) / (S + I + D + M)

        where S is the number of substitutions.

        Be aware that the SAM or BAM file must be created using minimap2 and
        the --cs option to store the CIGAR in a new CS format, which also
        contains the information about substitutions. Other mappers are also
        handled (e.g. bwa) but the substitutions are then solely based on the
        NM tag: S is replaced by ``NM - D - I`` as soon as one alignment has
        no ``cs`` tag.

        Alignments without any tag are ignored. Alignments that have tags but
        neither a ``cs`` tag nor a CIGAR string are kept with I, D, M and
        mismatch set to 0.

        :param int max_align: stop once this many alignments have been kept.
            Zero or negative values (the default) mean no limit.
        :return: a :class:`pandas.DataFrame` with one row per kept alignment.
            ``NM`` is -1 when the tag is absent and ``mismatch`` is None for
            alignments described by a CIGAR string only.
        """
        from sequana import Cigar

        count = 0
        I, D, M, L, mapq, flags, NM = [], [], [], [], [], [], []
        S = []
        for a in self._data:
            # tags and cigar are populated if there is a match. With --cs the
            # cigar is not populated so we can only look at the tags, which
            # can be an empty list
            if a.tags is None or len(a.tags) == 0:
                continue
            count += 1

            # build the tag lookup once instead of once per query
            tags = dict(a.tags)

            mapq.append(a.mapq)
            L.append(a.qlen)
            NM.append(tags.get("NM", -1))
            flags.append(a.flag)

            if "cs" in tags:
                cs = CS(tags["cs"])
                S.append(cs["S"])
                I.append(cs["I"])
                D.append(cs["D"])
                M.append(cs["M"])
            elif a.cigarstring:
                cigar = Cigar(a.cigarstring).as_dict()
                I.append(cigar["I"])
                D.append(cigar["D"])
                M.append(cigar["M"])
                S.append(None)  # no info about substitutions in the cigar
            else:
                I.append(0)
                D.append(0)
                M.append(0)
                S.append(0)

            if max_align > 0 and count == max_align:
                break

            if count % 10000 == 0:
                logger.debug("Read %s alignments", count)

        I = np.array(I)
        D = np.array(D)
        M = np.array(M)
        NM = np.array(NM)
        S = np.array(S)

        try:
            C = 1 - (I + D + S) / (S + I + D + M)
            logger.info("computed Concordance based on minimap2 --cs option")
        except TypeError:
            # S holds None for CIGAR-only alignments, which cannot be summed
            # with the integer arrays: estimate the substitutions from NM
            logger.info("computed Concordance based on standard CIGAR information using INDEL and NM tag")
            computed_S = NM - D - I
            C = 1 - (I + D + computed_S) / (computed_S + I + D + M)

        df = pd.DataFrame([C, L, I, D, M, mapq, flags, NM, S])
        df = df.T
        df.columns = ["concordance", "length", "I", "D", "M", "mapq", "flags", "NM", "mismatch"]
        return df
Ejemplo n.º 26
0
    def plot(
            self,
            num=1,
            cmap=None,
            colorbar=True,
            vmin=None,
            vmax=None,
            colorbar_position="right",
            gradient_span="None",
            figsize=(12, 8),
            fontsize=None,
    ):
        """Draw the heatmap together with its row and column dendrograms.

        Using as input::

            df = pd.DataFrame({'A':[1,0,1,1],
                               'B':[.9,0.1,.6,1],
                            'C':[.5,.2,0,1],
                            'D':[.5,.2,0,1]})

        we can plot the heatmap + dendrogram as follows::

            h = Heatmap(df)
            h.plot(vmin=0, vmax=1.1)


        .. plot::
            :include-source:
            :width: 80%

            from sequana.viz import heatmap
            df = heatmap.get_heatmap_df()
            h = heatmap.Heatmap(df)
            h.category_column['A'] = 1
            h.category_column['C'] = 1
            h.category_column['D'] = 2
            h.category_column['B'] = 2
            h.plot()

        :param num: figure number given to :func:`pylab.figure`.
        :param cmap: colormap; defaults to ``self.params.cmap``. It is passed
            through ``colormap.cmap_builder`` when that call succeeds.
        :param colorbar: draw the color legend when true.
        :param vmin: lower bound of the color normalisation.
        :param vmax: upper bound of the color normalisation.
        :param str colorbar_position: ``"right"`` or ``"top left"``.
        :param str gradient_span: ``"min_to_max_centered"``, ``"only_max"``
            or ``"only_min"`` recompute *vmin*/*vmax* from the data; any
            other value leaves them as given.
        :param figsize: figure size given to :func:`pylab.figure`.
        :param fontsize: font size; when not set, it is chosen from the
            number of row/column labels.
        :return: dictionary of the created artists. Possible keys are
            ``dendogram1``, ``dendogram2``, ``heatmap``, ``category_column``,
            ``category_row``, ``colorbar`` and ``colorbar_scalablemap``; a
            key is present only if the matching element was drawn.
        :raises ValueError: if *colorbar_position* is not supported.

        Side effects: ``matplotlib.rcParams["font.size"]`` is changed,
        ``self.frame`` is reordered to follow the dendrogram leaves, and
        ``self.gradient_span`` and ``self.d`` are set.
        """
        # save all parameters in a dict
        layout = {}

        if cmap is None:
            cmap = self.params.cmap
        try:
            import colormap

            cmap = colormap.cmap_builder(cmap)
        except Exception:
            # best effort: if the colormap package is missing or cannot build
            # the map, cmap is handed to matplotlib unchanged
            pass

        # keep track of row and column names for later.
        row_header = self.frame.index
        column_header = self.frame.columns

        import matplotlib

        # FIXME something clever for the fontsize
        # the more labels, the smaller the font; an explicit fontsize wins
        n_labels = max(len(row_header), len(column_header))
        if fontsize:
            font_size = fontsize
        elif n_labels > 100:
            font_size = 6
        elif n_labels > 50:
            font_size = 7
        elif n_labels > 30:
            font_size = 8
        else:
            font_size = 12
        matplotlib.rcParams["font.size"] = font_size

        # scaling min/max range
        self.gradient_span = gradient_span  #'only_max'
        # min_to_max, min_to_max_centered, only_max, only_min

        if self.gradient_span == "min_to_max_centered":
            vmax = self.frame.max().max()
            vmin = self.frame.min().min()
            vmax = max([vmax, abs(vmin)])
            vmin = vmax * -1
        elif self.gradient_span == "only_max":
            vmin = 0
            vmax = self.frame.max().max()
        elif self.gradient_span == "only_min":
            vmin = self.frame.min().min()
            vmax = 0
        norm = matplotlib.colors.Normalize(vmin, vmax)

        # Scale the figure window size #
        fig = pylab.figure(num=num, figsize=figsize)
        fig.clf()

        # LAYOUT --------------------------------------------------
        # ax1 (dendrogram 1) on the left of the heatmap
        [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6]
        width_between_ax1_axr = 0.004
        # distance between the top color bar axis and the matrix
        height_between_ax1_axc = 0.004
        # Sufficient size to show
        color_bar_w = 0.015

        # axr, placement of row side colorbar, right of ax1
        axr_x = ax1_x + ax1_w + width_between_ax1_axr
        axr_y = ax1_y
        axr_w = color_bar_w
        axr_h = ax1_h
        width_between_axr_axm = 0.004

        # axc, placement of column side colorbar, above the heatmap
        axc_x = axr_x + axr_w + width_between_axr_axm
        axc_y = ax1_y + ax1_h + height_between_ax1_axc
        axc_w = 0.5
        axc_h = color_bar_w
        height_between_axc_ax2 = 0.004

        # axm, placement of heatmap for the data matrix
        axm_x = axr_x + axr_w + width_between_axr_axm
        axm_y = ax1_y
        axm_w = axc_w
        axm_h = ax1_h

        # ax2 (dendrogram 2), on the top of the heatmap #
        ax2_x = axr_x + axr_w + width_between_axr_axm
        ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
        ax2_w = axc_w
        ax2_h = 0.15

        # axcb - placement of the color legend #
        if colorbar_position == "top left":
            [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.09]
        elif colorbar_position == "right":
            [axcb_x, axcb_y, axcb_w, axcb_h] = [0.85, 0.2, 0.08, 0.6]
        else:
            raise ValueError("'top left' or 'right' accepted for now")

        # COLUMN DENDROGRAM (top) -------------------------------------
        if self.column_method:
            Y = self.linkage(self.frame.transpose(), self.column_method,
                             self.column_metric)
            ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)

            # color_threshold=0 and above_threshold_color='k' colors all
            # dendrogram into black
            Z = hierarchy.dendrogram(
                Y,
                color_threshold=0,
                above_threshold_color="k",
                distance_sort="descending",
            )

            ax2.set_xticks([])
            ax2.set_yticks([])
            # apply the clustering for the array-dendrograms to the actual matrix data
            idx2 = Z["leaves"]
            self.frame = self.frame.iloc[:, idx2]
            layout["dendogram2"] = ax2
        else:
            idx2 = range(self.frame.shape[1])

        # ROW DENDROGRAM (left) ---------------------------------------
        if self.row_method:
            Y = self.linkage(self.frame, self.row_method, self.row_metric)

            ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True)
            Z = hierarchy.dendrogram(
                Y,
                orientation="right",
                color_threshold=0,
                above_threshold_color="k",
                distance_sort="descending",
            )

            ax1.set_xticks([])
            ax1.set_yticks([])
            # apply the clustering for the array-dendrograms to the actual matrix data
            idx1 = Z["leaves"]
            self.frame = self.frame.iloc[idx1, :]
            layout["dendogram1"] = ax1
        else:
            idx1 = range(self.frame.shape[0])

        # HEATMAP itself
        axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])
        axm.imshow(
            self.frame,
            aspect="auto",
            origin="lower",
            interpolation="None",
            cmap=cmap,
            norm=norm,
        )
        axm.set_xticks([])
        axm.set_yticks([])
        layout["heatmap"] = axm

        # TEXT: row labels right of the matrix, column labels below it.
        # idx1/idx2 map a displayed position to its position in the headers
        # saved before the reordering.
        for i in range(self.frame.shape[0]):
            axm.text(
                self.frame.shape[1] - 0.5,
                i,
                "  " + str(row_header[idx1[i]]),
                verticalalignment="center",
            )

        for i in range(self.frame.shape[1]):
            axm.text(
                i,
                -0.9,
                " " + str(column_header[idx2[i]]),
                rotation=90,
                verticalalignment="top",
                horizontalalignment="center",
            )

        # CATEGORY column ------------------------------
        if self.category_column:
            axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])

            # use the headers saved before reordering so that the categories
            # match the column labels written under the heatmap
            category_col = [
                self.category_column[column_header[i]] for i in idx2
            ]

            dc = np.array(category_col, dtype=int)
            dc.shape = (1, len(category_col))
            cmap_c = matplotlib.colors.ListedColormap(
                self.params.col_side_colors)
            axc.matshow(dc, aspect="auto", origin="lower", cmap=cmap_c)
            axc.set_xticks([])
            axc.set_yticks([])
            layout["category_column"] = axc

        # CATEGORY row -------------------------------
        if self.category_row:
            axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])
            # self.category_row must be a dictionary keyed by the names found
            # in the index of the dataframe.

            category_row = [self.category_row[row_header[i]] for i in idx1]

            dr = np.array(category_row, dtype=int)
            dr.shape = (len(category_row), 1)
            cmap_r = matplotlib.colors.ListedColormap(
                self.params.col_side_colors)
            axr.matshow(dr, aspect="auto", origin="lower", cmap=cmap_r)
            axr.set_xticks([])
            axr.set_yticks([])
            layout["category_row"] = axr

        # COLORBAR ----------------------
        if colorbar:
            axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h],
                                frame_on=False)
            if colorbar_position == "right":
                orientation = "vertical"
            else:
                orientation = "horizontal"
            cb = matplotlib.colorbar.ColorbarBase(ax=axcb,
                                                  cmap=cmap,
                                                  norm=norm,
                                                  orientation=orientation)
            layout["colorbar"] = cb
            layout["colorbar_scalablemap"] = axcb

        #   could be useful
        self.d = {"ordered": self.frame.copy(), "rorder": idx1, "corder": idx2}

        return layout
Ejemplo n.º 27
0
 def _set_x(self, X):
     """Store *X* as a NumPy array and record its smallest and largest values.

     The array is kept in ``self._Xtarget``; ``self.lower_bound`` and
     ``self.upper_bound`` receive its minimum and maximum.
     """
     target = np.array(X)
     self._Xtarget = target
     self.lower_bound = min(target)
     self.upper_bound = max(target)
Ejemplo n.º 28
0
 def _set_y(self, Y):
     """Store *Y* as a NumPy array in ``self._Ytarget``."""
     self._Ytarget = np.array(Y)