Example #1
    def moving_average(self, n, circular=False):
        """Compute moving average of the genome coverage

        :param int n: window size (must be odd)
        :param bool circular: whether the chromosome is circular

        Stores the result in the :attr:`df` attribute (dataframe), in a
        column named *ma*.

        """
        N = len(self.df['cov'])
        assert n < N/2
        from sequana.stats import moving_average

        # cumulative-sum trick: centred moving average of window n in O(N)
        ret = np.cumsum(np.array(self.df["cov"]), dtype=float)
        ret[n:] = ret[n:] - ret[:-n]
        ma = ret[n - 1:] / n
        mid = int(n / 2)
        self.df["ma"] = pd.Series(ma, index=np.arange(start=mid,
            stop=(len(ma) + mid)))

        if circular:
            # FIXME: shift of +-1 as compared to non circular case...
            # shift the data and compute the moving average
            self.data = list(self.df['cov'].values[N-n:]) +\
                list(self.df['cov'].values) + \
                list(self.df['cov'].values[0:n])
            ma = moving_average(self.data, n)
            self.ma = ma[n//2+1:-n//2]
            self.df["ma"] = pd.Series(self.ma, index=self.df['cov'].index)
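
The cumulative-sum lines above implement a standard O(N) centred moving
average. A standalone sketch of the same idea, independent of the class
(the function name and toy data below are illustrative):

import numpy as np
import pandas as pd

def centred_moving_average(values, n):
    # n is assumed odd; edge positions are left as NaN, as in the method above
    values = np.asarray(values, dtype=float)
    ret = np.cumsum(values)
    ret[n:] = ret[n:] - ret[:-n]
    ma = ret[n - 1:] / n
    mid = n // 2
    ma_series = pd.Series(ma, index=np.arange(mid, len(ma) + mid))
    return ma_series.reindex(range(len(values)))

cov = np.random.poisson(100, size=50)            # toy coverage signal
print(centred_moving_average(cov, 5).head(10))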
Example #2
    def to_kmer_content(self, k=7):
        """Return a Series with kmer count across all reads

        :param int k: kmer length (default: 7)
        :return: Pandas Series with index as kmer and values as count.

        Takes about 30 seconds on a million reads.
        """
        # Applying Counter (or str.count) read by read is slow; instead, we
        # buffer the kmers and update the Counter in large batches.
        import collections
        from sequana.kmer import get_kmer
        counter = collections.Counter()
        pb = Progress(len(self))
        buffer_ = []
        for i, this in enumerate(self):
            buffer_.extend(list(get_kmer(this['sequence'], k)))
            if len(buffer_) > 100000:
                counter += collections.Counter(buffer_)
                buffer_ = []
            pb.animate(i)
        counter += collections.Counter(buffer_)

        ts = pd.Series(counter)
        ts.sort_values(inplace=True, ascending=False)

        return ts
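
A self-contained sketch of the buffered counting strategy, with a simple
generator standing in for sequana.kmer.get_kmer (the helper name and the toy
reads are illustrative):

import collections
import pandas as pd

def kmer_iter(sequence, k):
    # stand-in for sequana.kmer.get_kmer: yield all overlapping k-mers
    for i in range(len(sequence) - k + 1):
        yield sequence[i:i + k]

reads = ["ACGTACGTACGT", "TTTTACGTAAAA"]
counter = collections.Counter()
buffer_ = []
for read in reads:
    buffer_.extend(kmer_iter(read, 4))
    if len(buffer_) > 100000:                # flush periodically to bound memory
        counter += collections.Counter(buffer_)
        buffer_ = []
counter += collections.Counter(buffer_)      # flush the remainder
ts = pd.Series(counter).sort_values(ascending=False)
print(ts.head())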
Example #3
def evenness(data):
    r"""Return the evenness of the coverage

    :Reference: Konrad Oexle, Journal of Human Genetics 2016, Evaluation
        of the evenness score in NGS.

    Works before or after normalisation, but the two lead to different
    results.

    .. math::

        C = \mathrm{round}(\bar{X}), \qquad
        D_2 = \{x \in X \mid x \le C\}, \qquad
        E = 1 - \frac{|D_2| - \sum_{x \in D_2} x / C}{|X|}

    """
    coverage = pd.Series(data)

    coverage = coverage.dropna()

    C = float(round(coverage.mean()))
    D2 = coverage[coverage <= C]
    if len(D2) == 0:
        return 1
    else:
        return 1. - (len(D2) - sum(D2) / C) / len(coverage)
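
A quick check on toy coverage vectors (values chosen for illustration,
assuming the function above is in scope):

print(evenness([10, 10, 10, 10]))   # perfectly even coverage -> 1.0
print(evenness([0, 0, 10, 30]))     # same mean but uneven coverage -> 0.5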
Example #4
    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count mapped reads per SIRV group (reference names truncated to
        *shift* characters), optionally plotting the counts as a bar chart."""
        aa = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(aa) == 0:
            return pd.Series(), self.df

        aa['group'] = aa.reference_name.apply(lambda x: x[0:shift])
        mapped = aa.query("mapq>@mapq_min").groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        #data.to_csv(path + "_hq_sirv_grouped.csv")
        return mapped, self.df
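
The grouping step simply buckets reference names by their first *shift*
characters; on a toy dataframe (names and values illustrative):

import pandas as pd

df = pd.DataFrame({
    "reference_name": ["SIRV101", "SIRV102", "SIRV201", "-1"],
    "mapq": [60, 60, 30, 0],
})
aa = df.query("reference_name not in [-1, '-1']").copy()   # drop unmapped reads
aa["group"] = aa.reference_name.apply(lambda x: x[0:5])
print(aa.groupby("group").count()["mapq"])                  # SIRV1 -> 2, SIRV2 -> 1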
Example #5
    def sirv(self):
        """Return a list with one Series of SIRV group counts per input dataframe."""
        sirv = []
        shift = 5
        for df in self.rawdata:
            aa = df.query("reference_name not in [-1, '-1']").copy()
            if len(aa) == 0:
                # no mapped reads in this sample: append an empty (NaN) Series
                sirv.append(pd.Series(index=self.SIRV_names, dtype=float))
                continue

            aa['group'] = aa.reference_name.apply(lambda x: x[0:shift])
            # filter quality and flags
            #mask = np.logical_or(df.flags & 256, df.flags & 2048)
            #aa = aa.query("mapq>@self.mapq_min and @mask")

            data = aa.groupby("group").count()["mapq"]
            data.name = None
            sirv.append(data)
        return sirv
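
The list of per-sample Series returned above can then be assembled into a
single comparison table; a minimal sketch (sample labels and counts are
illustrative):

import pandas as pd

counts = [pd.Series({"SIRV1": 10, "SIRV2": 5}),
          pd.Series({"SIRV1": 8, "SIRV3": 2})]   # e.g. the output of sirv()
table = pd.concat(counts, axis=1)
table.columns = ["sample_1", "sample_2"]
print(table.fillna(0))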
Example #6
    def plot_sequence_quality(self, max_score=40, ax=None):
        """Plot the mean Phred quality per base position for every sample, with
        red/orange/green bands marking low/medium/high quality regions."""
        ymax = max_score + 1
        xmax = 0
        for sample in self.fastqc_data.keys():
            if "per_base_sequence_quality" in self.fastqc_data[sample]:
                # one point per position: x is the centre of the FastQC base
                # range, y is the mean quality at that position
                data = {
                    self._avg_bp_from_range(d['base']): d['mean']
                    for d in self.fastqc_data[sample]['per_base_sequence_quality']
                }
                df = pd.Series(data)
                df.plot(color="k", alpha=0.5)

                if df.max() > ymax:
                    ymax = df.max()
                if df.index.max() > xmax:
                    xmax = df.index.max()

        if ax:
            pylab.sca(ax)
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.4)
        pylab.fill_between([0, xmax], [20, 20], [30, 30],
                           color='orange',
                           alpha=0.4)
        pylab.fill_between([0, xmax], [30, 30], [ymax, ymax],
                           color='green',
                           alpha=0.4)

        X = range(1, xmax + 1)

        pylab.ylim([0, ymax])
        if xmax != 0:
            pylab.xlim([0, xmax])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Phred Score", fontsize=12)
        pylab.grid(axis='x')
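
The dict comprehension above converts FastQC's per-base records into
position -> mean-quality pairs. A sketch of that step on a toy record, with an
inline stand-in for the _avg_bp_from_range helper (FastQC reports either single
positions or ranges such as "10-14"; the midpoint of a range is used as the x
coordinate):

import pandas as pd

def avg_bp_from_range(base):
    # stand-in for self._avg_bp_from_range
    if "-" in str(base):
        low, high = (int(x) for x in str(base).split("-"))
        return (low + high) / 2
    return float(base)

record = [{"base": "1", "mean": 32.1},
          {"base": "2", "mean": 33.0},
          {"base": "10-14", "mean": 30.5}]
quality = pd.Series({avg_bp_from_range(d["base"]): d["mean"] for d in record})
print(quality)   # index 1.0, 2.0, 12.0 with the corresponding mean qualities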
Example #7
    def add_stats(self):
        """Add a 'Basic stats on read length' section to the HTML report."""
        df = pd.Series(self.summary['read_stats']).to_frame().T
        df.index = ['read length stats']
        table = DataTable(df, "table", index=True)
        table.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 15,
            'scrollCollapse': 'true',
            'dom': 't',
            "paging": "false",
            'buttons': ['copy', 'csv']
            }
        js = table.create_javascript_function()
        # IMPORTANT: float_format='%d' rounds every value to an integer,
        # so the GC content column is displayed as an integer as well.
        html_tab = table.create_datatable(float_format='%d')
        html = "{} {}".format(html_tab, js)

        self.sections.append({
          "name": "Basic stats on read length",
          "anchor": "table",
          "content": html
        })
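
The first line turns a flat dict of read statistics into a one-row dataframe
suitable for the DataTable; for example (statistic names and values are
illustrative):

import pandas as pd

read_stats = {"mean": 151.0, "min": 35, "max": 151, "N": 1000000, "GC content (%)": 42.7}
df = pd.Series(read_stats).to_frame().T
df.index = ["read length stats"]
print(df)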
Example #8
    def plot_jaccard_distance(self, mode, padjs=[0.0001, 0.001, 0.01, 0.05, 0.1],
            Nfc=50, smooth=False, window=5):
        """Plot the Jaccard similarity between the two sets of differentially
        expressed genes as a function of the log2 fold-change threshold, for
        several adjusted p-value cutoffs."""
        assert mode in ['down', 'up', 'all']
        pylab.clf()
        if mode == "down":
            m1 = self.r1.df.log2FoldChange.min()
            m2 = self.r2.df.log2FoldChange.min()
            minimum = min(m1, m2)
            X = pylab.linspace(0, minimum, Nfc)
        elif mode == "up":
            m1 = self.r1.df.log2FoldChange.max()
            m2 = self.r2.df.log2FoldChange.max()
            maximum = max(m1, m2)
            X = pylab.linspace(0, maximum, Nfc)
        else:
            minmax1 = self.r1.df.log2FoldChange.abs().max()
            minmax2 = self.r2.df.log2FoldChange.abs().max()
            maximum = max(minmax1, minmax2)
            X = pylab.linspace(0, maximum, Nfc)

        common = {}
        for padj in padjs:
            I = []
            common[padj] = []
            for x in X:
                if mode == "down":
                    # less than a given fold change that is negative
                    A = set(self.r1.df.query("log2FoldChange<=@x and padj<@padj").index)
                    B = set(self.r2.df.query("log2FoldChange<=@x and padj<@padj").index)
                elif mode == "up":
                    # greater than a given fold change that is positive
                    A = set(self.r1.df.query("log2FoldChange>=@x and padj<@padj").index)
                    B = set(self.r2.df.query("log2FoldChange>=@x and padj<@padj").index)
                else:
                    A = set(self.r1.df.query("(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj").index)
                    B = set(self.r2.df.query("(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj").index)
                if len(A) == 0 or len(B) == 0:
                    # at least one of the sets is empty; the Jaccard index is
                    # not defined there, so 100 is reported by convention
                    I.append(100)
                else:
                    res = len(A.intersection(B)) / (len(A) + len(B) - len(A.intersection(B)))  * 100
                    I.append(res)   
                common[padj].append(len(A.intersection(B)))

            if smooth:
                try:
                    # rolling-median smoothing; keep the raw values if it fails
                    I = pd.Series(I).rolling(window).median().values
                except Exception:
                    pass
            pylab.plot(X, I, 'o-', label=str(padj))
        ax = pylab.gca()
        ax.set_ylabel("Jaccard similarity (intersection/union)")
        ax.set_xlabel("Fold change (log2)")
        ax2 = ax.twinx()
        for padj in padjs:
            ax2.plot(X, common[padj], color='orange', ls='--')
        ax2.set_ylabel("Cardinality of the intersection")
        ax.legend()
        ax.set_ylim([0,100])
        #ax2.set_ylim([0,100])
        if mode == "down":
            ax.axvline(-2, ls='--', color='r')
        else:
            ax.axvline(2, ls='--', color='r')
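
The quantity on the left axis is the Jaccard index of two gene sets, expressed
as a percentage; a one-line reminder on toy sets:

A = {"gene1", "gene2", "gene3"}
B = {"gene2", "gene3", "gene4"}
jaccard = len(A & B) / len(A | B) * 100   # 2 shared out of 4 in the union -> 50.0
print(jaccard)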