def hist_average_quality(self, fontsize=16, bins=None):
    """Stacked histogram of the mean quality value of HQ and LQ isoforms.

    The quality of a read is the mean, over its decoded ``quality``
    string, of each character code minus 33. HQ counts are drawn first
    and LQ counts are stacked on top. On a secondary y-axis, a black
    curve shows the total number of isoforms minus the cumulative count
    at each bin edge.

    :param int fontsize: font size of the legend
    :param bins: bin edges given to :func:`numpy.histogram`. Defaults to
        ``range(0, 94)`` when None.
    """

    def _mean_qualities(reads):
        # one mean quality value per read
        return [
            pylab.mean([ord(char) - 33 for char in read["quality"].decode()])
            for read in reads
        ]

    hq_qv = _mean_qualities(self.hq_sequence)
    lq_qv = _mean_qualities(self.lq_sequence)

    if bins is None:
        bins = range(0, 94)

    hq_counts, edges = np.histogram(hq_qv, bins=bins)
    lq_counts, edges = np.histogram(lq_qv, bins=bins)

    # bars are positioned on the right edge of each bin
    right_edges = edges[1:]
    pylab.bar(right_edges, hq_counts, width=1, label="HQ")
    pylab.bar(right_edges, lq_counts, bottom=hq_counts, width=1, label="LQ")
    pylab.xlim([0.5, 93.5])
    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)

    # secondary axis: isoforms left once the cumulative count is removed
    ax = pylab.twinx()
    stacked = hq_counts + lq_counts
    total = np.sum(stacked)
    remaining = [total] + list(total - np.cumsum(stacked))
    ax.plot(edges, remaining, "k")
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
    """Grouped bar plot of the number of ORF and CDS found in each frame.

    For each of the six frames (-3 to 3, excluding 0) the number of
    non-missing ``len_ORF`` and ``len_CDS`` entries of ``self._ORF_pos``
    is counted. ``self._find_ORF_CDS()`` is called first when
    ``self._ORF_pos`` is still None.

    :param float alpha: transparency of the bars
    :param int bins: not used by this method
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param float bar_width: width of each bar; ORF and CDS bars are
        shifted by half of it on each side of the frame value
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    frames = [-3, -2, -1, 1, 2, 3]

    def _count(frame, column):
        # number of non-missing values of *column* for the given frame
        in_frame = self._ORF_pos[self._ORF_pos["frame"] == frame]
        return in_frame[column].dropna().shape[0]

    nb_res_ORF = [_count(frame, "len_ORF") for frame in frames]
    nb_res_CDS = [_count(frame, "len_CDS") for frame in frames]

    centers = np.array(frames)
    shift = bar_width / 2
    pylab.bar(centers - shift, nb_res_ORF, bar_width, alpha=alpha,
              label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(centers + shift, nb_res_CDS, bar_width, alpha=alpha,
              label="CDS N = %d" % sum(nb_res_CDS))
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def plot_percentage_null_read_counts(self):
    """Plot the percentage of null counts in each sample.

    Bars give, for each sample, the percentage of rows of ``self.df``
    whose count is zero. The dashed horizontal line gives the percentage
    of rows whose counts sum to zero across all samples.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()

    """
    samples = self.sample_names
    positions = range(len(samples))
    nb_rows = len(self.df)

    per_sample_counts = self.df[samples]
    percent_null = (per_sample_counts == 0).sum() / nb_rows * 100
    nb_all_null = (per_sample_counts.sum(axis=1) == 0).sum()

    pylab.clf()
    pylab.bar(positions, percent_null)
    pylab.axhline(nb_all_null / nb_rows * 100, lw=2, ls="--", color="k")
    pylab.xticks(positions, samples)
    pylab.xlabel("Sample")
def histogram_sequence_lengths(self, logy=True):
    """Histogram sequence lengths

    One bar is drawn per observed sequence length; lengths that never
    occur are skipped.

    :param bool logy: if True (default), bar heights are the log10 of the
        counts; otherwise the raw counts are shown. The y label is set
        accordingly.
    :raises ValueError: if ``self.sequences`` is empty (``max`` of an
        empty list).

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_sequence_lengths()

    """
    data = [len(x) for x in self.sequences]
    longest = max(data)

    # numpy.histogram closes the last bin on both sides. With edges
    # 0..longest the final bin would be [longest-1, longest] and merge two
    # lengths; the extra edge keeps the longest length in its own bin.
    bary, barx = np.histogram(data, bins=range(longest + 2))

    # get rid of zeros to avoid warnings
    observed = [(x, y) for x, y in zip(barx, bary) if y != 0]
    bx = [x for x, _ in observed]
    by = [y for _, y in observed]

    if logy:
        pylab.bar(bx, pylab.log10(by))
        ylabel = "Count (log scale)"
    else:
        pylab.bar(bx, by)
        # raw counts are plotted, so the label must not claim a log scale
        ylabel = "Count"

    pylab.xlim([1, longest + 1])
    pylab.grid(True)
    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    pylab.ylabel(ylabel, fontsize=self.fontsize)
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Bar plot of the total read count (in millions) of each sample.

    The counts of each sample column of ``self.df`` are summed. Each bar
    is colored with ``self.colors`` indexed by the condition returned by
    ``self.get_cond_from_sample``.

    :param int fontsize: font size of the axis labels and title
    :param sample_list: not used by this method

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    sample_names = self.sample_names
    positions = range(len(sample_names))
    totals = self.df[sample_names].sum()

    pylab.clf()
    # one color per sample, picked from the sample's condition
    colors = [
        self.colors[self.get_cond_from_sample(sample)]
        for sample in self.sample_names
    ]
    millions = (totals / 1000000).values
    pylab.bar(positions, millions, color=colors, alpha=1,
              zorder=10, lw=1, ec="k", width=0.9)
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(positions, self.sample_names)
def hist_ZMW_subreads(self, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True,
        ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    ``self._get_ZMW_passes()`` is called first when ``self._nb_pass`` is
    still None. One bar is drawn for every number of passes from 1 up to
    the largest key of ``self._nb_pass``.

    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting
    :param int fontsize: font size of the axis labels and title
    :param bool grid: if True, a grid is added
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_ZMW_subreads()
    """
    if self._nb_pass is None:
        self._get_ZMW_passes()

    highest = max(self._nb_pass.keys())
    passes = range(1, highest + 1)
    counts = [self._nb_pass[nb] for nb in passes]

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.bar(passes, counts, alpha=alpha, label=label, log=logy)

    # with only a few bars, force a fixed 0..5 tick range
    if len(passes) < 5:
        ticks = range(6)
        pylab.xticks(ticks, ticks)

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def bar_plot_contigs_length(self):
    """Compare the contig lengths with the reference sequence lengths.

    Two series of sorted lengths are drawn on the same axes: the
    sequences of the reference (``self.reference`` loaded with ``FastA``)
    and the contigs in ``self.fasta``. Contigs use one unit of x per
    contig; reference bars are spread over the same axis with a regular
    step.

    :raises ZeroDivisionError: if the reference contains no sequence.
    """
    # show length of N contigs as compare to length of the reference
    fref = FastA(self.reference)
    Nref = len(fref.sequences)
    N = len(self.fasta)

    ref_lengths = sorted(fref.lengths)

    # Build exactly one x position per reference length. The former
    # range(0, N, step) could yield fewer positions than lengths
    # (e.g. N=5, Nref=4), which makes bar() fail on mismatched shapes.
    # When both counts agree the positions are identical to range(0, N, step).
    # max(1, ...) keeps the step valid when there is no contig.
    step = max(1, int(pylab.ceil(N / Nref)))
    ref_positions = [index * step for index in range(len(ref_lengths))]

    pylab.clf()
    pylab.bar(ref_positions, ref_lengths, width=Nref / 1.1,
              label="Plasmodium chromosomes")
    pylab.bar(range(0, N), sorted(self.fasta.lengths), width=1,
              label="canu {} contigs".format(N))
    pylab.legend()
def hist_average_quality(self, fontsize=16):
    """Stacked histogram of the mean quality value of HQ and LQ isoforms.

    The quality of a read is the mean of ``phred.ascii_to_quality``
    applied to every character of its decoded ``quality`` string. Counts
    are binned with edges ``range(0, 94)``; LQ counts are stacked on top
    of HQ counts.

    :param int fontsize: font size of the legend
    """

    def _mean_qualities(reads):
        # one mean quality value per read
        return [
            mean([phred.ascii_to_quality(X) for X in read["quality"].decode()])
            for read in reads
        ]

    # Read the sequences from the instance: the previous code used an
    # undefined/global ``iso`` object instead of ``self``.
    hq_qv = _mean_qualities(self.hq_sequence)
    lq_qv = _mean_qualities(self.lq_sequence)

    bins = range(0, 94)
    Y1, X = numpy.histogram(hq_qv, bins=bins)
    Y2, X = numpy.histogram(lq_qv, bins=bins)

    # bars are positioned on the right edge of each bin
    pylab.bar(X[1:], Y1, width=1, label="HQ")
    pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
    """Plot how many ORF and CDS were found in each reading frame.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None. For every frame in -3..3 (0 excluded), the non-missing
    ``len_ORF`` and ``len_CDS`` values are counted and drawn as two
    side-by-side bars.

    :param float alpha: transparency of the bars
    :param int bins: not used by this method
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param float bar_width: width of a single bar
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # number of ORF and CDS found by frame
    frames = [-3, -2, -1, 1, 2, 3]
    nb_res_ORF, nb_res_CDS = [], []
    for frame in frames:
        selected = self._ORF_pos[self._ORF_pos["frame"] == frame]
        nb_res_ORF.append(selected["len_ORF"].dropna().shape[0])
        nb_res_CDS.append(selected["len_CDS"].dropna().shape[0])

    half_width = bar_width / 2
    frame_positions = np.array(frames)

    orf_label = "ORF N = %d" % sum(nb_res_ORF)
    cds_label = "CDS N = %d" % sum(nb_res_CDS)
    pylab.bar(frame_positions - half_width, nb_res_ORF, bar_width,
              alpha=alpha, label=orf_label)
    pylab.bar(frame_positions + half_width, nb_res_CDS, bar_width,
              alpha=alpha, label=cds_label)

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def barplot(self, filename="lane{}_status.png", lanes=None):
    """Save one bar plot of read counts per lane.

    For each lane, the counts of every entry whose name is not
    'Undetermined' are drawn in blue, followed by a single bar for the
    'Undetermined' count. That last bar is green when undetermined reads
    are below 20% of the determined total, orange below 50%, and red
    otherwise (including when the determined total is zero).

    :param str filename: output file pattern; ``{}`` is replaced by the
        lane identifier
    :param lanes: lanes to plot. Defaults to all unique values of the
        ``lane`` column returned by ``self.get_data_reads()``.
    """
    df = self.get_data_reads()
    if lanes is None:
        lanes = df.lane.unique()

    for lane in lanes:
        pylab.clf()

        # NOTE: the query strings read the local variable ``lane`` through
        # the pandas ``@`` syntax, so the loop variable must keep this name.
        query = "lane==@lane and name!='Undetermined'"
        counts = df.query(query)["count"]
        total = counts.sum()
        L = len(counts)

        query = "lane==@lane and name=='Undetermined'"
        under = df.query(query)["count"].sum()

        if total > 0:
            pylab.bar(range(L), counts, color="b", label="reads")

        if total == 0:
            color = "red"
        else:
            percent_under = 100 * under / total
            if percent_under < 20:
                color = "green"
            elif percent_under < 50:
                color = "orange"
            else:
                color = "red"

        pylab.bar(range(L, L + 1), under, color=color, label="undetermined")
        pylab.xticks([])
        pylab.ylabel("Number of reads")

        # The legend is best-effort, but only ordinary errors are ignored:
        # KeyboardInterrupt and SystemExit must still propagate.
        try:
            pylab.legend(loc="lower left")
        except Exception:
            pass

        pylab.title("Lane {}".format(lane))
        pylab.savefig(filename.format(lane), dpi=200)
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Bar plot of the total read count (in millions) of each sample.

    Sample names are taken from the columns of ``self.df`` that start
    with ``norm``, with the ``norm.`` text removed. The columns carrying
    those names are then summed. Bar colors are fixed: three red
    followed by three blue.

    :param int fontsize: font size of the axis labels and title
    :param sample_list: not used by this method

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    sample_names = [
        column.replace("norm.", "")
        for column in self.df.columns
        if column.startswith("norm")
    ]
    positions = range(len(sample_names))
    totals = self.df[sample_names].sum()

    pylab.clf()
    bar_colors = ["r", "r", "r", "b", "b", "b"]
    millions = (totals / 1000000).values
    pylab.bar(positions, millions, color=bar_colors, alpha=1)
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True)
    pylab.title("Total read count per sample", fontsize=fontsize)
def plot_percentage_null_read_counts(self):
    """Plot the percentage of null counts in each sample.

    Bars give, for each sample, the percentage of rows of ``self.df``
    whose count is zero; each bar is colored with ``self.colors`` indexed
    by the condition returned by ``self.get_cond_from_sample``. The
    dashed horizontal line gives the percentage of rows whose counts sum
    to zero across all samples.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()

    """
    positions = range(len(self.sample_names))
    nb_rows = len(self.df)

    null_per_sample = (self.df[self.sample_names] == 0).sum()
    percent_null = null_per_sample / nb_rows * 100
    nb_all_null = (self.df[self.sample_names].sum(axis=1) == 0).sum()

    # one color per sample, picked from the sample's condition
    colors = [
        self.colors[self.get_cond_from_sample(sample)]
        for sample in self.sample_names
    ]

    pylab.clf()
    pylab.bar(positions, percent_null, color=colors, alpha=1,
              zorder=10, lw=1, ec="k", width=0.9)
    pylab.axhline(nb_all_null / nb_rows * 100, lw=2, ls="--",
                  color="k", zorder=20)
    pylab.xticks(positions, self.sample_names)
    pylab.xlabel("Sample")
    pylab.ylabel("Proportion of null counts (%)")
    pylab.grid(True, zorder=0)
def plot_count_per_sample(self, fontsize=12, rotation=45):
    """Number of mapped and annotated reads (i.e. counts) per sample.

    The columns of ``self.counts_raw`` are summed and joined with
    ``self.design_df``; each bar is colored with the ``group_color``
    column of the joined table.

    :param int fontsize: font size of the axis labels and title
    :param rotation: rotation of the x tick labels (degrees)

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    pylab.clf()
    df = self.counts_raw.sum().rename("total_counts")
    df = pd.concat([self.design_df, df], axis=1)

    pylab.bar(
        df.index,
        df.total_counts / 1000000,
        color=df.group_color,
        lw=1,
        zorder=10,
        ec="k",
        width=0.9,
    )

    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("reads (M)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(rotation=rotation, ha="right")

    # tight_layout is best-effort, but only ordinary errors are ignored:
    # KeyboardInterrupt and SystemExit must still propagate.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_percentage_null_read_counts(self):
    """Bars represent the percentage of null counts in each samples.

    The dashed horizontal line represents the percentage of rows of
    ``self.counts_raw`` that are equal to zero in all samples. Bars are
    colored with the ``group_color`` column obtained after joining with
    ``self.design_df``.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()

    """
    pylab.clf()
    nb_rows = self.counts_raw.shape[0]

    # how many null counts ?
    df = (self.counts_raw == 0).sum() / nb_rows * 100
    df = df.rename("percent_null")
    df = pd.concat([self.design_df, df], axis=1)

    pylab.bar(df.index, df.percent_null, color=df.group_color,
              ec="k", lw=1, zorder=10)

    # Expressed in percent like the bars: without the factor 100 the line
    # was drawn as a 0-1 fraction on a 0-100 axis.
    all_null = (self.counts_raw == 0).all(axis=1).sum() / nb_rows * 100

    pylab.axhline(all_null, ls="--", color="black", alpha=0.5)

    pylab.xticks(rotation=45, ha="right")
    pylab.ylabel("Proportion of null counts (%)")
    pylab.grid(True, zorder=0)
    pylab.tight_layout()