def moving_average(self, n, circular=False):
    """Compute the moving average of the genome coverage.

    :param int n: window's size. Must be odd and small compared to the
        genome length (asserted to be below half of it).
    :param bool circular: is the chromosome circular or not

    Store the results in the :attr:`df` attribute (dataframe) with a
    column named *ma*. In the non-circular case the first and last
    ``n // 2`` positions of *ma* are left undefined (NaN) because the
    centered window does not fit there.
    """
    N = len(self.df['cov'])
    assert n < N / 2

    # Cumulative-sum trick: sum over a sliding window of size n is the
    # difference of two cumulative sums, done in a single numpy pass.
    ret = np.cumsum(np.array(self.df["cov"]), dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    ma = ret[n - 1:] / n
    # Center the window: result at position i covers [i-mid, i+mid].
    mid = int(n / 2)
    self.df["ma"] = pd.Series(ma,
        index=np.arange(start=mid, stop=(len(ma) + mid)))

    if circular:
        # Import only needed on this branch; keeping it here avoids an
        # unconditional dependency for the common non-circular case.
        from sequana.stats import moving_average
        # FIXME: shift of +-1 as compared to non circular case...
        # Pad the signal with the wrapped-around ends, then compute the
        # moving average on the extended data.
        self.data = list(self.df['cov'].values[N - n:]) + \
            list(self.df['cov'].values) + \
            list(self.df['cov'].values[0:n])
        ma = moving_average(self.data, n)
        self.ma = ma[n // 2 + 1:-n // 2]
        self.df["ma"] = pd.Series(self.ma, index=self.df['cov'].index)
def to_kmer_content(self, k=7):
    """Return a Series with kmer count across all reads

    :param int k: (default to 7-mers)
    :return: Pandas Series with index as kmer and values as count.

    Takes about 30 seconds on a million reads.
    """
    # Applying Counter (or .count) read by read is too slow; instead,
    # kmers are accumulated in a buffer that is folded into the global
    # counter roughly every 100,000 entries.
    import collections
    from sequana.kmer import get_kmer

    kmer_counts = collections.Counter()
    progress = Progress(len(self))
    pending = []
    for read_index, read in enumerate(self):
        pending.extend(get_kmer(read['sequence'], k))
        if len(pending) > 100000:
            kmer_counts.update(pending)
            pending = []
        progress.animate(read_index)
    # Flush whatever is left in the buffer.
    kmer_counts.update(pending)
    return pd.Series(kmer_counts).sort_values(ascending=False)
def evenness(data):
    """Return Evenness of the coverage

    :Reference: Konrad Oexle, Journal of Human Genetics 2016, Evaluation
        of the evenness score in NGS.

    work before or after normalisation but lead to different results.

    .. math::

        C = mean(X)
        D2 = X[X<=C]
        N = len(X)
        n = len(D2)
        E = 1 - (n - sum(D2) / C) / N
    """
    coverage = pd.Series(data)
    coverage = coverage.dropna()
    # Guard against empty input: the original code would raise on
    # round(nan) when computing the rounded mean.
    if len(coverage) == 0:
        return 1
    C = float(round(coverage.mean()))
    # Guard against an all-(near-)zero coverage: C == 0 would otherwise
    # trigger a division by zero; a constant signal is perfectly even.
    if C == 0:
        return 1
    D2 = coverage[coverage <= C]
    if len(D2) == 0:
        return 1
    else:
        return 1. - (len(D2) - sum(D2) / C) / len(coverage)
def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
    """Count reads mapped on each SIRV group (optionally bar-plotted).

    Reads with reference_name equal to -1/'-1' are unmapped and
    discarded; the group is the first *shift* characters of the
    reference name. Returns (counts per group, self.df).
    """
    sirv_reads = self.df.query("reference_name not in [-1, '-1']").copy()
    if len(sirv_reads) == 0:
        # nothing mapped on a SIRV reference
        return pd.Series(), self.df
    sirv_reads['group'] = sirv_reads.reference_name.apply(
        lambda name: name[0:shift])
    mapped = sirv_reads.query("mapq>@mapq_min").groupby("group").count()["mapq"]
    mapped.name = None
    if plot:
        mapped.plot(kind="bar")
        pylab.title(title)
        pylab.tight_layout()
    return mapped, self.df
def sirv(self):
    """Return, for each raw dataset, the per-group SIRV read counts.

    The group is the first 5 characters of the reference name; unmapped
    reads (reference_name of -1/'-1') are ignored. Datasets with no
    SIRV read yield an empty Series indexed by SIRV_names.
    """
    results = []
    prefix_length = 5
    for raw_df in self.rawdata:
        mapped = raw_df.query("reference_name not in [-1, '-1']").copy()
        if len(mapped) == 0:
            results.append(pd.Series(index=self.SIRV_names))
            continue
        mapped['group'] = mapped.reference_name.apply(
            lambda name: name[0:prefix_length])
        # NOTE: no quality/flag filtering is applied here on purpose.
        counts = mapped.groupby("group").count()["mapq"]
        counts.name = None
        results.append(counts)
    return results
def plot_sequence_quality(self, max_score=40, ax=None):
    """Plot per-base mean quality for every sample on a single figure.

    :param int max_score: minimum top value for the Y axis (Phred score)
    :param ax: matplotlib axes to draw in; current axes otherwise

    Background bands mark the conventional quality ranges: red (<20),
    orange (20-30), green (>30).
    """
    ymax = max_score + 1
    xmax = 0
    for sample in self.fastqc_data.keys():
        # BUG FIX: the guard used to test "per_sequence_quality_scores"
        # while the body reads 'per_base_sequence_quality'; check the
        # key actually accessed to avoid KeyError / skipped samples.
        if "per_base_sequence_quality" in self.fastqc_data[sample]:
            data = {
                self._avg_bp_from_range(d['base']): d['mean']
                for d in
                self.fastqc_data[sample]['per_base_sequence_quality']
            }
            series = pd.Series(data)
            series.plot(color="k", alpha=0.5)
            # grow the axes limits to cover every sample
            if series.max() > ymax:
                ymax = series.max()
            if series.index.max() > xmax:
                xmax = series.index.max()

    if ax:
        pylab.sca(ax)
    pylab.fill_between([0, xmax], [0, 0], [20, 20],
        color='red', alpha=0.4)
    pylab.fill_between([0, xmax], [20, 20], [30, 30],
        color='orange', alpha=0.4)
    pylab.fill_between([0, xmax], [30, 30], [ymax, ymax],
        color='green', alpha=0.4)

    pylab.ylim([0, ymax])
    if xmax != 0:
        pylab.xlim([0, xmax])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Phred Score", fontsize=12)
    pylab.grid(axis='x')
def add_stats(self):
    """Append a 'Basic stats on read length' section to the report.

    Builds a one-row DataTable from the read statistics stored in
    self.summary['read_stats'] and registers the rendered HTML plus its
    javascript in self.sections.
    """
    stats_frame = pd.Series(self.summary['read_stats']).to_frame().T
    stats_frame.index = ['read length stats']
    table = DataTable(stats_frame, "table", index=True)
    table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 't',
        "paging": "false",
        'buttons': ['copy', 'csv'],
    }
    js = table.create_javascript_function()
    # IMPORTANT: '%d' rounds every float to an integer in the rendered
    # table — the GC content column is therefore shown as an integer too.
    html_tab = table.create_datatable(float_format='%d')
    self.sections.append({
        "name": "Basic stats on read length",
        "anchor": "table",
        "content": f"{html_tab} {js}",
    })
def plot_jaccard_distance(self, mode, padjs=[0.0001, 0.001, 0.01, 0.05, 0.1],
        Nfc=50, smooth=False, window=5):
    """Plot the Jaccard similarity between the two comparisons.

    For each adjusted p-value threshold in *padjs*, scan *Nfc* fold-change
    cutoffs and plot the Jaccard similarity (intersection over union, in
    percent) of the gene sets selected in self.r1 and self.r2. A second
    (right) axis shows the cardinality of the intersection.

    :param mode: 'down' (negative fold changes), 'up' (positive) or
        'all' (absolute value)
    :param padjs: adjusted p-value thresholds, one curve each
    :param int Nfc: number of fold-change cutoffs scanned
    :param bool smooth: smooth each curve with a rolling median
    :param int window: rolling-median window size used when *smooth*
    """
    assert mode in ['down', 'up', 'all']
    pylab.clf()

    # X axis: fold-change cutoffs from 0 to the most extreme value
    # observed in either comparison.
    if mode == "down":
        m1 = self.r1.df.log2FoldChange.min()
        m2 = self.r2.df.log2FoldChange.min()
        X = pylab.linspace(0, min(m1, m2), Nfc)
    elif mode == "up":
        m1 = self.r1.df.log2FoldChange.max()
        m2 = self.r2.df.log2FoldChange.max()
        X = pylab.linspace(0, max(m1, m2), Nfc)
    else:
        minmax1 = self.r1.df.log2FoldChange.abs().max()
        minmax2 = self.r2.df.log2FoldChange.abs().max()
        X = pylab.linspace(0, max(minmax1, minmax2), Nfc)

    common = {}
    for padj in padjs:
        I = []
        common[padj] = []
        for x in X:
            if mode == "down":
                # less than a given fold change that is negative
                A = set(self.r1.df.query("log2FoldChange<=@x and padj<@padj").index)
                B = set(self.r2.df.query("log2FoldChange<=@x and padj<@padj").index)
            elif mode == "up":
                # greater than a given fold change that is positive
                A = set(self.r1.df.query("log2FoldChange>=@x and padj<@padj").index)
                B = set(self.r2.df.query("log2FoldChange>=@x and padj<@padj").index)
            else:
                A = set(self.r1.df.query("(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj").index)
                B = set(self.r2.df.query("(log2FoldChange>=@x or log2FoldChange<=-@x) and padj<@padj").index)
            intersection = len(A.intersection(B))
            if len(A) == 0 or len(B) == 0:
                # no selection yet on one side: similarity set to 100
                I.append(100)
            else:
                union = len(A) + len(B) - intersection
                I.append(intersection / union * 100)
            common[padj].append(intersection)
        # BUG FIX: was a try/"assert False"/bare-except construct that
        # both abused assert for control flow and silently swallowed any
        # error raised by the rolling median; a plain conditional is the
        # intended behavior.
        if smooth:
            I = pd.Series(I).rolling(window).median().values
        pylab.plot(X, I, 'o-', label=str(padj))

    ax = pylab.gca()
    ax.set_ylabel("Jaccard similarity (intersection/union)")
    ax.set_xlabel("Fold change (log2)")
    ax2 = ax.twinx()
    for padj in padjs:
        ax2.plot(X, common[padj], color='orange', ls='--')
    ax2.set_ylabel("Cardinality of the union ")
    ax.legend()
    ax.set_ylim([0, 100])
    # mark the conventional |log2FC| = 2 cutoff
    if mode == "down":
        ax.axvline(-2, ls='--', color='r')
    else:
        ax.axvline(2, ls='--', color='r')