def hist_average_quality(self, fontsize=16, bins=None):
    """Plot a stacked histogram of the average quality value per isoform.

    For every read of :attr:`hq_sequence` and :attr:`lq_sequence`, the mean
    of the quality characters (ASCII code minus 33, presumably a Phred+33
    encoding) is computed. HQ counts are drawn as bars and LQ counts are
    stacked on top of them. A secondary y-axis shows, at each bin edge, how
    many isoforms have not yet been counted in the bins below that edge.

    :param fontsize: font size used for the legend.
    :param bins: bins forwarded to :func:`numpy.histogram`. Defaults to
        ``range(0, 94)`` (i.e. from 0 to 94).
    """
    def average_qv(reads):
        # One mean quality value per read; quality strings are stored as bytes.
        return [
            pylab.mean([ord(char) - 33 for char in read['quality'].decode()])
            for read in reads
        ]

    hq_means = average_qv(self.hq_sequence)
    lq_means = average_qv(self.lq_sequence)

    if bins is None:
        bins = range(0, 94)

    hq_counts, _ = np.histogram(hq_means, bins=bins)
    # The bin edges used for plotting are those returned for the LQ reads.
    lq_counts, edges = np.histogram(lq_means, bins=bins)
    all_counts = hq_counts + lq_counts

    bar_positions = edges[1:]
    pylab.bar(bar_positions, hq_counts, width=1, label="HQ")
    pylab.bar(bar_positions, lq_counts, bottom=hq_counts, width=1, label="LQ")
    pylab.xlim([0.5, 93.5])
    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)

    # Second y-axis: number of isoforms remaining at each bin edge.
    remaining_axis = pylab.twinx()
    total = np.sum(all_counts)
    remaining = [total] + list(total - np.cumsum(all_counts))
    remaining_axis.plot(edges, remaining, "k")
def moving_average(self, n, circular=False):
    """Compute the moving average of the genome coverage.

    The result is stored in the :attr:`df` attribute (dataframe) in a
    column named *ma*.

    :param n: window's size. Must be odd and smaller than half the number
        of positions in ``df['cov']``.
    :param bool circular: is the chromosome circular or not. When True,
        the coverage is padded with ``n`` values taken from the opposite
        end on each side, so that one value is produced for every position.
        The padded data and the trimmed average are also stored in
        :attr:`data` and :attr:`ma`. When False, the averaged values are
        indexed from ``n // 2`` onwards and positions without a complete
        window receive no value.
    :raises AssertionError: if ``n`` is not smaller than half the length
        of the coverage.
    """
    coverage = self.df["cov"]
    N = len(coverage)
    # Kept as an assertion so that callers still see an AssertionError.
    assert n < N / 2, "window size n must be smaller than half the data length"

    from sequana.stats import moving_average

    if circular:
        # NOTE(review): the original code carried a FIXME about a +-1 shift
        # compared with the non circular case. For odd n both branches
        # appear to be centred on the same position; confirm before removing.
        values = coverage.values
        # Wrap the signal: last n values, then the data, then first n values.
        self.data = list(values[N - n:]) + list(values) + list(values[0:n])
        ma = moving_average(self.data, n)
        # Trim the padded average back to exactly N values.
        self.ma = ma[n // 2 + 1:-n // 2]
        self.df["ma"] = pd.Series(self.ma, index=coverage.index)
    else:
        # Use the shared helper instead of duplicating the cumsum algorithm.
        ma = moving_average(coverage.values, n)
        mid = int(n / 2)
        # Label each average with the centre of its window.
        self.df["ma"] = pd.Series(
            ma, index=np.arange(start=mid, stop=(len(ma) + mid)))
def N50(data):
    """Return the N50 value of a list of contig lengths (sorted or not).

    The N50 is the contig length such that contigs of that length or longer
    contain at least half of the nucleotides of the assembly.

    :param data: contig lengths, in any order.
    :return: the length of the N50 contig.
    """
    lengths = np.sort(data)
    running_total = lengths.cumsum()
    half_assembly = running_total[-1] / 2
    # First (smallest) contig at which the ascending cumulative sum exceeds
    # half of the assembly.
    n50_index = (running_total > half_assembly).argmax()
    return lengths[n50_index]
def L50(data):
    """Return the smallest number of contigs whose length sum produces N50

    In other words, the number of longest contigs required to cover at
    least half of the assembly.

    ::

        >>> data = [2, 3, 4, 5, 6, 7, 8, 9, 10]
        >>> int(L50(data))
        3

    :param data: contig lengths, in any order.
    """
    lengths = np.sort(data)
    running_total = lengths.cumsum()
    half_assembly = running_total[-1] / 2
    n50_index = (running_total > half_assembly).argmax()
    # Contigs are sorted in increasing order, so everything from the N50
    # contig to the end of the array is at least as long as the N50.
    return len(lengths) - n50_index
def moving_average(data, n):
    """Compute moving average

    :param data: sequence of numbers.
    :param n: window's size (odd or even). Must be at least 1.
    :return: array of ``len(data) - n + 1`` averages (empty if ``n`` is
        larger than ``len(data)``).
    :raises ValueError: if ``n`` is smaller than 1.

    ::

        >>> from sequana.stats import moving_average as ma
        >>> ma([1,1,1,1,3,3,3,3], 4)
        array([1. , 1.5, 2. , 2.5, 3. ])

    .. note:: the final vector does not have the same size as the input
        vector.

    """
    if n < 1:
        # A negative window would silently slice from the wrong end and
        # divide by a negative number; zero fails with an obscure error.
        raise ValueError("window size n must be >= 1, got %r" % (n,))

    ret = np.cumsum(data, dtype=float)
    # Turn the running total into sums over windows of n values.
    ret[n:] = ret[n:] - ret[:-n]
    # The first n - 1 entries are sums over incomplete windows.
    return ret[n - 1:] / n
def plot_all_skews(self, figsize=(10, 12), fontsize=16, alpha=0.5):
    """Plot all skew-related curves in nine vertically stacked panels.

    Panels, from top to bottom: GC skew, cumulative sum of the GC skew,
    AT skew, cumulative sum of the AT skew, the three cumulative curves
    stored in ``_Xn``, ``_Yn`` and ``_Zn``, GC content and AT content.
    All panels share the same x axis.

    :param figsize: figure size forwarded to :func:`pylab.subplots`.
    :param fontsize: font size of the figure title.
    :param alpha: transparency applied to every curve.
    :raises AttributeError: if no window has been set.
    """
    if self._window is None:
        raise AttributeError("Please set a valid window to compute skew")

    # NOTE: a tenth panel showing an FFT (self._c_fft) is currently
    # disabled, hence nine rows only.
    fig, axarr = pylab.subplots(9, 1, sharex=True, figsize=figsize)

    genome_length = len(self)
    gc_fraction = self.gc_content()
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which leaked the source indentation into the title.
    main_title = (
        "Window size = %d (%.0f %% of genome )\n"
        "GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%"
    ) % (
        self._window,
        self._window * 100 / genome_length,
        gc_fraction * 100,
        (1 - gc_fraction) * 100,
        self._ignored_nuc * 100,
    )
    pylab.suptitle(main_title, fontsize=fontsize)

    # GC skew
    axarr[0].set_title("GC skew (blue) - Cumulative sum (red)")
    axarr[0].plot(list(self._GC_skew_slide[0]), 'b-', alpha=alpha)
    axarr[0].set_ylabel("(G -C) / (G + C)")

    axarr[1].plot(list(np.cumsum(self._GC_skew_slide[0])), 'r-', alpha=alpha)
    axarr[1].set_ylabel("(G -C) / (G + C)")

    # AT skew
    axarr[2].set_title("AT skew (blue) - Cumulative sum (red)")
    axarr[2].plot(list(self._AT_skew_slide[0]), 'b-', alpha=alpha)
    axarr[2].set_ylabel("(A -T) / (A + T)")

    axarr[3].plot(list(np.cumsum(self._AT_skew_slide[0])), 'r-', alpha=alpha)
    axarr[3].set_ylabel("(A -T) / (A + T)", rotation=0)

    # Xn
    axarr[4].set_title("Cumulative RY skew (Purine - Pyrimidine)")
    axarr[4].plot(self._Xn, 'g-', alpha=alpha)
    axarr[4].set_ylabel("(A + G) - (C + T)")

    # Yn
    axarr[5].set_title("Cumulative MK skew (Amino - Keto)")
    axarr[5].plot(self._Yn, 'g-', alpha=alpha)
    axarr[5].set_ylabel("(A + C) - (G + T)")

    # Zn
    axarr[6].set_title(
        "Cumulative H-bond skew (Weak H-bond - Strong H-bond)")
    axarr[6].plot(self._Zn, 'g-', alpha=alpha)
    axarr[6].set_ylabel("(A + T) - (G + C)")

    # GC content
    axarr[7].set_title("GC content")
    axarr[7].plot(list(self._GC_content_slide[0]), 'k-', alpha=alpha)
    axarr[7].set_ylabel("GC")

    # AT content
    axarr[8].set_title("AT content")
    axarr[8].plot(list(self._AT_content_slide[0]), 'k-', alpha=alpha)
    axarr[8].set_ylabel("AT")

    fig.tight_layout()
    # Leave room above the panels for the two-line figure title.
    fig.subplots_adjust(top=0.88)
def _compute_skews(self):
    """Slide a window along the sequence and compute skews and contents.

    The following attributes are set on the instance:

    - ``_GC_skew_slide`` / ``_AT_skew_slide``: ``(G - C) / (G + C)`` and
      ``(A - T) / (A + T)`` for each window position;
    - ``_GC_content_slide`` / ``_AT_content_slide``: number of G+C and A+T
      in each window, divided by the window size;
    - ``_Xn``, ``_Yn``, ``_Zn``: lists derived from the cumulative sums of
      the per-nucleotide rows of ``_cumul`` (A+G minus C+T, A+C minus G+T,
      A+T minus C+G);
    - ``_ignored_nuc``: one minus the G/C and A/T fractions computed over
      the full length.

    The working containers (``_slide_window``, ``_seq_right``, ``_cumul``
    and the result arrays) are created by the ``_init_*`` helpers, which
    are not visible here.
    """
    ### initialisation = Calculating GC skew and AT skew for first window
    self._init_sliding_window()
    GC_content_slide, GC_skew_slide = self._init_list_results()
    AT_content_slide, AT_skew_slide = self._init_list_results()
    self._init_cumul_nuc()

    # Only G, C, A and T are tracked; any other symbol counts as ignored.
    c = Counter(self._slide_window)
    dict_counts = {'G': c['G'], 'C': c['C'], 'A': c['A'], 'T': c['T']}
    i = 0

    # GC
    sumGC = float(dict_counts['G'] + dict_counts['C'])
    GC_content_slide[0][i] = sumGC
    # Skew is left at its initial value when the window has no G or C.
    if sumGC > 0:
        GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
    # AT
    sumAT = float(dict_counts['A'] + dict_counts['T'])
    AT_content_slide[0][i] = sumAT
    # Same guard against a division by zero for A/T.
    if sumAT > 0:
        AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

    ### Compute for all genome
    # Both containers support popleft(); presumably collections.deque
    # instances built by _init_sliding_window -- TODO confirm.
    while (self._seq_right):
        # Shift the window by one: drop the leftmost symbol, append the next.
        out_nuc = self._slide_window.popleft()
        in_nuc = self._seq_right.popleft()
        self._slide_window.append(in_nuc)

        i += 1

        # Progress message every 500000 positions.
        if i % 500000 == 0:
            logger.info("%d / %d" % (i, self.__len__()))

        # if in and out are the same : do nothing, append same result
        if out_nuc != in_nuc:
            # remove out from counters, add in (tracked nucleotides only)
            if out_nuc in self._dict_nuc:
                dict_counts[out_nuc] -= 1
            if in_nuc in self._dict_nuc:
                dict_counts[in_nuc] += 1
            sumGC = float(dict_counts['G'] + dict_counts['C'])
            sumAT = float(dict_counts['A'] + dict_counts['T'])

        # fill results
        # GC
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT
        # cumul
        # Flag the incoming nucleotide in its own row of _cumul; the column
        # i + window - 1 is presumably its position in the sequence
        # (assuming the first window starts at position 0 -- TODO confirm).
        if in_nuc in self._dict_nuc:
            self._cumul[self._dict_nuc[in_nuc]][i + self._window - 1] += 1

    # Convert raw counts per window into fractions of the window size.
    self._GC_content_slide = GC_content_slide / float(self._window)
    self._AT_content_slide = AT_content_slide / float(self._window)
    # Drop any column beyond the sequence length, then accumulate each row.
    self._cumul = np.delete(self._cumul, range(self.__len__(), self._cumul.shape[1]), 1)
    self._cumul = np.cumsum(self._cumul, axis=1)

    ### save result for Z curve
    self._Xn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['G']]) -\
                    (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['T']]))

    self._Yn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['C']]) -\
                    (self._cumul[self._dict_nuc['G']] + self._cumul[self._dict_nuc['T']]))

    self._Zn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['T']]) -\
                    (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['G']]))

    self._AT_skew_slide = AT_skew_slide
    self._GC_skew_slide = GC_skew_slide

    ### check proportion of ignored nucleotides
    # The last column of the cumulative rows holds the total count of each
    # tracked nucleotide.
    GC_content_total = (self._cumul[self._dict_nuc['G']][-1] + self._cumul[self._dict_nuc['C']][-1]) / float(
        self.__len__())
    AT_content_total = (self._cumul[self._dict_nuc['A']][-1] + self._cumul[self._dict_nuc['T']][-1]) / float(
        self.__len__())
    self._ignored_nuc = 1.0 - GC_content_total - AT_content_total
def plot_all_skews(self, figsize=(10, 12), fontsize=16, alpha=0.5):
    """Plot all skew-related curves in nine vertically stacked panels.

    Panels, from top to bottom: GC skew, cumulative sum of the GC skew,
    AT skew, cumulative sum of the AT skew, the three cumulative curves
    stored in ``_Xn``, ``_Yn`` and ``_Zn``, GC content and AT content.
    All panels share the same x axis.

    :param figsize: figure size forwarded to :func:`pylab.subplots`.
    :param fontsize: font size of the figure title.
    :param alpha: transparency applied to every curve.
    :raises AttributeError: if no window has been set.
    """
    if self._window is None:
        raise AttributeError("Please set a valid window to compute skew")

    # NOTE: a tenth panel showing an FFT (self._c_fft) is currently
    # disabled, hence nine rows only.
    fig, axarr = pylab.subplots(9, 1, sharex=True, figsize=figsize)

    genome_length = len(self)
    gc_fraction = self.gc_content()
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which leaked the source indentation into the title.
    main_title = (
        "Window size = %d (%.0f %% of genome )\n"
        "GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%"
    ) % (
        self._window,
        self._window * 100 / genome_length,
        gc_fraction * 100,
        (1 - gc_fraction) * 100,
        self._ignored_nuc * 100,
    )
    pylab.suptitle(main_title, fontsize=fontsize)

    # GC skew
    axarr[0].set_title("GC skew (blue) - Cumulative sum (red)")
    axarr[0].plot(list(self._GC_skew_slide[0]), 'b-', alpha=alpha)
    axarr[0].set_ylabel("(G -C) / (G + C)")

    axarr[1].plot(list(np.cumsum(self._GC_skew_slide[0])), 'r-', alpha=alpha)
    axarr[1].set_ylabel("(G -C) / (G + C)")

    # AT skew
    axarr[2].set_title("AT skew (blue) - Cumulative sum (red)")
    axarr[2].plot(list(self._AT_skew_slide[0]), 'b-', alpha=alpha)
    axarr[2].set_ylabel("(A -T) / (A + T)")

    axarr[3].plot(list(np.cumsum(self._AT_skew_slide[0])), 'r-', alpha=alpha)
    axarr[3].set_ylabel("(A -T) / (A + T)", rotation=0)

    # Xn
    axarr[4].set_title("Cumulative RY skew (Purine - Pyrimidine)")
    axarr[4].plot(self._Xn, 'g-', alpha=alpha)
    axarr[4].set_ylabel("(A + G) - (C + T)")

    # Yn
    axarr[5].set_title("Cumulative MK skew (Amino - Keto)")
    axarr[5].plot(self._Yn, 'g-', alpha=alpha)
    axarr[5].set_ylabel("(A + C) - (G + T)")

    # Zn
    axarr[6].set_title("Cumulative H-bond skew (Weak H-bond - Strong H-bond)")
    axarr[6].plot(self._Zn, 'g-', alpha=alpha)
    axarr[6].set_ylabel("(A + T) - (G + C)")

    # GC content
    axarr[7].set_title("GC content")
    axarr[7].plot(list(self._GC_content_slide[0]), 'k-', alpha=alpha)
    axarr[7].set_ylabel("GC")

    # AT content
    axarr[8].set_title("AT content")
    axarr[8].plot(list(self._AT_content_slide[0]), 'k-', alpha=alpha)
    axarr[8].set_ylabel("AT")

    fig.tight_layout()
    # Leave room above the panels for the two-line figure title.
    fig.subplots_adjust(top=0.88)
def _compute_skews(self):
    """Slide a window along the sequence and compute skews and contents.

    The following attributes are set on the instance:

    - ``_GC_skew_slide`` / ``_AT_skew_slide``: ``(G - C) / (G + C)`` and
      ``(A - T) / (A + T)`` for each window position;
    - ``_GC_content_slide`` / ``_AT_content_slide``: number of G+C and A+T
      in each window, divided by the window size;
    - ``_Xn``, ``_Yn``, ``_Zn``: lists derived from the cumulative sums of
      the per-nucleotide rows of ``_cumul`` (A+G minus C+T, A+C minus G+T,
      A+T minus C+G);
    - ``_ignored_nuc``: one minus the G/C and A/T fractions computed over
      the full length.

    The working containers (``_slide_window``, ``_seq_right``, ``_cumul``
    and the result arrays) are created by the ``_init_*`` helpers, which
    are not visible here.
    """
    ### initialisation =  Calculating GC skew and AT skew for first window
    self._init_sliding_window()
    GC_content_slide, GC_skew_slide = self._init_list_results()
    AT_content_slide, AT_skew_slide = self._init_list_results()
    self._init_cumul_nuc()

    # Only G, C, A and T are tracked; any other symbol counts as ignored.
    c = Counter(self._slide_window)
    dict_counts = {'G' : c['G'], 'C' : c['C'], 'A' : c['A'], 'T' : c['T']}
    i = 0

    # GC
    sumGC = float(dict_counts['G'] + dict_counts['C'])
    GC_content_slide[0][i] = sumGC
    # Skew is left at its initial value when the window has no G or C.
    if sumGC > 0:
        GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
    # AT
    sumAT = float(dict_counts['A'] + dict_counts['T'])
    AT_content_slide[0][i] = sumAT
    # Same guard against a division by zero for A/T.
    if sumAT > 0:
        AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

    ### Compute for all genome
    # Both containers support popleft(); presumably collections.deque
    # instances built by _init_sliding_window -- TODO confirm.
    while(self._seq_right):
        # Shift the window by one: drop the leftmost symbol, append the next.
        out_nuc = self._slide_window.popleft()
        in_nuc = self._seq_right.popleft()
        self._slide_window.append(in_nuc)

        i += 1

        # Progress message every 500000 positions.
        if i % 500000 == 0:
            logger.info("%d / %d" % (i, self.__len__()))
        # if in and out are the same : do nothing, append same result
        if out_nuc != in_nuc:
            # remove out from counters, add in (tracked nucleotides only)
            if out_nuc in self._dict_nuc:
                dict_counts[out_nuc] -= 1
            if in_nuc in self._dict_nuc:
                dict_counts[in_nuc] += 1
            sumGC = float(dict_counts['G'] + dict_counts['C'])
            sumAT = float(dict_counts['A'] + dict_counts['T'])

        # fill results
        # GC
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C'])/sumGC
        # AT
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T'])/sumAT
        # cumul
        # Flag the incoming nucleotide in its own row of _cumul; the column
        # i + window - 1 is presumably its position in the sequence
        # (assuming the first window starts at position 0 -- TODO confirm).
        if in_nuc in self._dict_nuc:
            self._cumul[self._dict_nuc[in_nuc]][i+self._window-1] +=1

    # Convert raw counts per window into fractions of the window size.
    self._GC_content_slide = GC_content_slide/float(self._window)
    self._AT_content_slide = AT_content_slide/float(self._window)
    # Drop any column beyond the sequence length, then accumulate each row.
    self._cumul = np.delete(self._cumul, range(self.__len__(),self._cumul.shape[1]),1)
    self._cumul = np.cumsum(self._cumul,axis=1)

    ### save result for Z curve
    self._Xn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['G']]) -\
                    (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['T']]))

    self._Yn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['C']]) -\
                    (self._cumul[self._dict_nuc['G']] + self._cumul[self._dict_nuc['T']]))

    self._Zn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['T']]) -\
                    (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['G']]))

    self._AT_skew_slide = AT_skew_slide
    self._GC_skew_slide = GC_skew_slide

    ### check proportion of ignored nucleotides
    # The last column of the cumulative rows holds the total count of each
    # tracked nucleotide.
    GC_content_total = (self._cumul[self._dict_nuc['G']][-1] + self._cumul[self._dict_nuc['C']][-1]) / float(self.__len__())
    AT_content_total = (self._cumul[self._dict_nuc['A']][-1] + self._cumul[self._dict_nuc['T']][-1]) / float(self.__len__())
    self._ignored_nuc = 1.0 - GC_content_total - AT_content_total