Example #1
0
    def hist_concordance(self, bins=100, fontsize=16):
        """Plot a histogram of the per-read concordance values.

        The values are read from ``self._concordance``. If that attribute
        does not exist yet, :meth:`_set_concordance` is called once to
        create it; later calls reuse the stored values.

        A vertical solid red line marks the mean and a dashed red line marks
        the median of the distribution.

        Concordance formula::

            1 - (in + del + mismatch) / (in + del + mismatch + match)

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while PacBio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: the post-mapping accuracy of the basecalls.
        Formula: [1 - (errors / subread length)], where errors = number of
        deletions + insertions + substitutions.

        :param int bins: number of bins passed to ``pylab.hist``.
        :param int fontsize: font size of the x-axis label.
        """
        try:
            concordance = self._concordance
        except AttributeError:
            # Only a missing attribute means "not computed yet"; any other
            # error (or a Ctrl-C) must propagate instead of being hidden.
            self._set_concordance()
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        # Same colour for both markers; the line style tells them apart
        # (solid = mean, dashed = median).
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
Example #2
0
    def hist_concordance(self, method, bins=100, fontsize=16):
        """Plot a histogram of the per-read concordance values.

        The values are read from ``self._concordance``. If that attribute
        does not exist yet, ``self._set_concordance(method)`` is called once
        to create it; later calls reuse the stored values, in which case
        *method* is not used.

        A vertical solid red line marks the mean and a dashed red line marks
        the median of the distribution.

        Concordance formula::

            1 - (in + del + mismatch) / (in + del + mismatch + match)

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while PacBio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: the post-mapping accuracy of the basecalls.
        Formula: [1 - (errors / subread length)], where errors = number of
        deletions + insertions + substitutions.

        :param method: forwarded unchanged to :meth:`_set_concordance` when
            the concordance has to be computed.
        :param int bins: number of bins passed to ``pylab.hist``.
        :param int fontsize: font size of the x-axis label.
        """
        try:
            concordance = self._concordance
        except AttributeError:
            # Only a missing attribute means "not computed yet"; any other
            # error (or a Ctrl-C) must propagate instead of being hidden.
            self._set_concordance(method)
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        # Same colour for both markers; the line style tells them apart
        # (solid = mean, dashed = median).
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
Example #3
0
    def get_stats(self, output="json"):
        """Return basic statistics about the coverage data.

        The statistics are computed from the ``cov`` column of ``self.df``
        (and from the ``gc`` column when it is present):

        - ``BOC``: percentage of rows whose coverage is strictly positive.
        - ``CV``: ``STD / DOC``; NaN when ``DOC`` is zero.
        - ``DOC``: mean of the coverage.
        - ``MAD``: ``median(|X - median(X)|)``.
        - ``Median``: median of the coverage.
        - ``STD``: standard deviation as returned by ``Series.std()``.
        - ``GC``: mean of the ``gc`` column multiplied by 100 (only when the
          column exists).

        An empty table yields NaN statistics instead of raising.

        :param str output: ``"json"`` (default) returns the table serialised
            with ``DataFrame.to_json()``; any other value returns the
            DataFrame itself.
        :return: a JSON string or a DataFrame with the columns ``name``,
            ``Value`` and ``Description`` (one row per statistic).
        """
        data = self.df
        cov = data['cov']
        n_rows = len(data)

        doc = cov.mean()
        std = cov.std()
        median = cov.median()

        # Vectorised count of covered positions; guard the division so that
        # an empty table gives NaN rather than a ZeroDivisionError.
        if n_rows:
            boc = 100 * (cov > 0).sum() / float(n_rows)
        else:
            boc = np.nan

        stats = {
            'DOC': doc,
            'STD': std,
            'Median': median,
            'BOC': boc,
            # The coefficient of variation is undefined for a zero mean.
            'CV': np.nan if doc == 0 else std / doc,
            # NaN deviations are dropped before taking the median.
            'MAD': (cov - median).abs().dropna().median(),
        }

        names = ['BOC', 'CV', 'DOC', 'MAD', 'Median', 'STD']
        # NOTE(review): no string formatting is applied to these texts, so
        # the "%s" in the first description is emitted literally; it is
        # probably meant to read "%" -- confirm before changing the output.
        descriptions = [
            "breadth of coverage: the proportion (in %s) of the "
            "genome covered by at least one read.",
            "the coefficient of variation.",
            "the sequencing depth (Depth of Coverage), that is the average of "
            "the genome coverage.",
            "median of the absolute median deviation defined as median(|X-median(X)|).",
            "Median of the coverage.",
            "standard deviation."
        ]

        if 'gc' in data.columns:
            stats['GC'] = data['gc'].mean() * 100
            names.append('GC')
            descriptions.append("GC content in %")

        df = pd.DataFrame({
            "name": names,
            "Value": [stats[x] for x in names],
            "Description": descriptions})

        if output == "json":
            return df.to_json()
        return df