def hist_concordance(self, bins=100, fontsize=16):
    """Plot a histogram of the concordance values.

    The concordance is defined as::

        1 - (in + del + mismatch) / (in + del + mismatch + match)

    For BWA and BLASR, the get_cigar_stats are different! BWA for
    instance has no X stored, while PacBio forbids the use of the M
    (CMATCH) tag and uses X (CDIFF) and = (CEQUAL) characters instead.

    Subread accuracy: the post-mapping accuracy of the basecalls.
    Formula: [1 - (errors / subread length)], where errors = number of
    deletions + insertions + substitutions.

    The mean is drawn as a solid red vertical line and the median as a
    dashed red vertical line on top of the histogram.

    :param int bins: number of bins passed to the histogram.
    :param int fontsize: font size of the x-axis label.
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # Only a missing cache should trigger the computation; any other
        # error (or a KeyboardInterrupt) must propagate to the caller.
        self._set_concordance()
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    mu = np.mean(concordance)
    median = np.median(concordance)
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def hist_concordance(self, method, bins=100, fontsize=16):
    """Plot a histogram of the concordance values.

    The concordance is defined as::

        1 - (in + del + mismatch) / (in + del + mismatch + match)

    For BWA and BLASR, the get_cigar_stats are different! BWA for
    instance has no X stored, while PacBio forbids the use of the M
    (CMATCH) tag and uses X (CDIFF) and = (CEQUAL) characters instead.

    Subread accuracy: the post-mapping accuracy of the basecalls.
    Formula: [1 - (errors / subread length)], where errors = number of
    deletions + insertions + substitutions.

    The mean is drawn as a solid red vertical line and the median as a
    dashed red vertical line on top of the histogram.

    :param method: forwarded as-is to ``_set_concordance``; it is only
        used when the concordance has not been computed yet.
    :param int bins: number of bins passed to the histogram.
    :param int fontsize: font size of the x-axis label.
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # Only a missing cache should trigger the computation; any other
        # error (or a KeyboardInterrupt) must propagate to the caller.
        self._set_concordance(method)
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    mu = np.mean(concordance)
    median = np.median(concordance)
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def get_stats(self, output="json"):
    """Return basic stats about the coverage data.

    The statistics are computed from the ``cov`` column of ``self.df``:

    * ``BOC``: percentage of rows whose coverage is strictly positive
      (NaN when the table is empty),
    * ``CV``: ``STD / DOC``,
    * ``DOC``: mean of the coverage,
    * ``MAD``: median(|X - median(X)|),
    * ``Median``: median of the coverage,
    * ``STD``: standard deviation of the coverage,
    * ``GC``: mean of the ``gc`` column times 100, only reported when
      ``self.df`` has a ``gc`` column.

    :param str output: ``"json"`` (default) returns the table serialised
        with ``DataFrame.to_json``; any other value returns the
        DataFrame itself.
    :return: a JSON string or a DataFrame with the columns ``name``,
        ``Value`` and ``Description``.
    """
    cov = self.df['cov']
    nrows = len(self.df)

    stats = {
        'DOC': cov.mean(),
        'STD': cov.std(),
        'Median': cov.median(),
        # Vectorised count of covered positions. The breadth of an empty
        # table is undefined: report NaN rather than dividing by zero.
        'BOC': 100 * (cov > 0).sum() / float(nrows) if nrows else np.nan,
    }

    try:
        stats['CV'] = stats['STD'] / stats['DOC']
    except ZeroDivisionError:
        # Only reachable when the mean is a plain Python zero; numpy
        # floats yield nan/inf by themselves instead of raising.
        stats['CV'] = np.nan

    stats['MAD'] = (stats['Median'] - cov).abs().dropna().median()

    names = ['BOC', 'CV', 'DOC', 'MAD', 'Median', 'STD']

    # NOTE(review): "%s" in the first description looks like a leftover
    # format placeholder (probably meant "%"); it is kept verbatim
    # because this text is returned to callers.
    descriptions = [
        "breadth of coverage: the proportion (in %s) of the "
        "genome covered by at least one read.",
        "the coefficient of variation.",
        "the sequencing depth (Depth of Coverage), that is the average of "
        "the genome coverage.",
        "median of the absolute median deviation defined as median(|X-median(X)|).",
        "Median of the coverage.",
        "standard deviation."
    ]

    if 'gc' in self.df.columns:
        stats['GC'] = self.df['gc'].mean() * 100
        names.append('GC')
        descriptions.append("GC content in %")

    df = pd.DataFrame({
        "name": names,
        "Value": [stats[x] for x in names],
        "Description": descriptions})

    if output == "json":
        return df.to_json()
    return df