def hist_read_length_consensus_isoform(self, mode="all", bins=80,
        rwidth=0.8, align="left", fontsize=16, edgecolor="k", **kwargs):
    """Histogram of the consensus isoform read lengths

    :param str mode: "all" uses the low and high quality reads together,
        "lq" only the low quality ones; any other value (e.g. "hq") selects
        the high quality reads.
    :param bins: forwarded to :func:`pylab.hist`
    :param rwidth: forwarded to :func:`pylab.hist`
    :param align: forwarded to :func:`pylab.hist`
    :param int fontsize: font size of the axis labels
    :param edgecolor: edge colour of the bars
    :param kwargs: any other argument accepted by :func:`pylab.hist`

    A second y-axis (labelled "CDF") shows the total number of reads minus
    the cumulated counts of the histogram.
    """
    pylab.clf()

    # both lists are always built, whatever the mode
    lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
    hq_lengths = [len(read['sequence']) for read in self.hq_sequence]

    if mode == "lq":
        lengths = lq_lengths
    elif mode == "all":
        lengths = lq_lengths + hq_lengths
    else:
        lengths = hq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    main_axis = pylab.gca()
    main_axis.set_ylim(bottom=0)
    main_axis.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # remaining number of reads, drawn on a twin axis
    twin_axis = pylab.gca().twinx()
    half_bin = (edges[1] - edges[0]) / 2
    remaining = len(lengths) - pylab.cumsum(counts)
    twin_axis.plot(edges[0:-1] - half_bin, remaining, "k")
    twin_axis.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def imshow_qualities(self):
    """Image of the mean quality per tile and per position

    ::

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.imshow_qualities()
        from pylab import tight_layout; tight_layout()

    The per-tile means are stored in :attr:`data_imqual` (one array per
    tile, tiles in sorted order).
    """
    tile_info = self._get_tile_info()

    # gather the quality vectors tile by tile
    per_tile = defaultdict(list)
    for tile, quality in zip(tile_info['tiles'], self.qualities):
        per_tile[tile].append(quality)

    self.data_imqual = []
    for tile in sorted(per_tile.keys()):
        self.data_imqual.append(pd.DataFrame(per_tile[tile]).mean().values)

    from biokit.viz import Imshow
    image = Imshow(self.data_imqual)
    image.plot(xticks_on=False, yticks_on=False, origin='lower')
    pylab.title("Quality per tile", fontsize=self.fontsize)
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("tile number")
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """plot GC content histogram

    :param int fontsize: font size of the x-axis label
    :param ec: add black contour on the bars
    :param bins: either a number, in which case that many bin edges are
        spread evenly between 0 and 100, or a sequence/array of bin edges
        (it is copied, never modified)

    A normal distribution built from the mean and standard deviation of the
    data is drawn on top of the (density-normalised) histogram.

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    data = self.get_gc_content()

    # A scalar is a number of edges; anything else is taken as the edges
    # themselves. Testing the dimension replaces a former bare "except".
    if np.ndim(bins) == 0:
        X = np.linspace(0, 100, bins)
    else:
        X = np.array(bins)

    pylab.hist(data, X, density=True, ec=ec)
    pylab.grid(True)

    mu = pylab.mean(data)
    sigma = pylab.std(data)

    # overlay the normal distribution over the histogram range
    X = pylab.linspace(X.min(), X.max(), 100)
    from sequana.misc import normpdf
    pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
    pylab.xlabel("GC content", fontsize=fontsize)
def histogram_sequence_lengths(self, logy=True):
    """Histogram sequence lengths

    :param bool logy: plot the log10 of the counts instead of the counts

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_sequence_lengths()

    """
    data = [len(x) for x in self.sequences]
    longest = max(data)

    # One unit-wide bin per length. The last numpy bin is closed on both
    # sides, so the edges must go one step beyond the longest sequence;
    # otherwise the longest length is merged into the previous bar.
    bary, barx = np.histogram(data, bins=range(longest + 2))

    # get rid of zeros to avoid warnings
    keep = bary != 0
    bx = barx[:-1][keep]
    by = bary[keep]

    if logy:
        pylab.bar(bx, pylab.log10(by))
        ylabel = "Count (log scale)"
    else:
        pylab.bar(bx, by)
        ylabel = "Count"

    pylab.xlim([1, longest + 1])
    pylab.grid(True)
    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    pylab.ylabel(ylabel, fontsize=self.fontsize)
def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
    """Plot an histogram of the flags contained in the BAM

    :param logy: the y-axis is logarithmic only when this is exactly True
    :param int fontsize: font size of the axis labels
    :param filename: if set, the figure is saved under that name
    :return: the object returned by the pandas bar plot

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_flags()

    .. seealso:: :class:`SAMFlags` for meaning of each flag
    """
    flag_counts = self.get_flags_as_df().sum()

    pylab.clf()
    options = {"kind": 'bar', "grid": True}
    if logy is True:
        options["logy"] = logy
    barplot = flag_counts.plot(**options)

    pylab.xlabel("flags", fontsize=fontsize)
    pylab.ylabel("count", fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
    return barplot
def hist_concordance(self, bins=100, fontsize=16):
    """Histogram of the concordance

    formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

    For BWA and BLASR, the get_cigar_stats are different !!!
    BWA for instance has no X stored while Pacbio forbids the use of the M
    (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

    Subread Accuracy: The post-mapping accuracy of the basecalls.
    Formula: [1 - (errors/subread length)], where errors = number of
    deletions + insertions + substitutions.

    :param bins: forwarded to :func:`pylab.hist`
    :param int fontsize: font size of the x-axis label

    The mean (solid red line) and the median (dashed red line) are drawn
    on top of the histogram.
    """
    # The concordance is computed lazily. Only a missing attribute should
    # trigger the computation; other errors must not be hidden.
    try:
        concordance = self._concordance
    except AttributeError:
        self._set_concordance()
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    mu = np.mean(concordance)
    median = np.median(concordance)
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def hist_average_quality(self, fontsize=16, bins=None):
    """Stacked histogram of the average quality of HQ and LQ isoforms

    :param int fontsize: font size of the legend
    :param bins: bins is from 0 to 94 (default: ``range(0, 94)``)

    The quality of a read is the mean of its decoded quality characters
    shifted by 33. A twin axis shows the total number of isoforms minus the
    cumulated counts.
    """
    def average_qv(read):
        return pylab.mean([ord(char) - 33
                           for char in read['quality'].decode()])

    hq_qv = [average_qv(read) for read in self.hq_sequence]
    lq_qv = [average_qv(read) for read in self.lq_sequence]

    if bins is None:
        bins = range(0, 94)

    hq_counts, edges = np.histogram(hq_qv, bins=bins)
    lq_counts, edges = np.histogram(lq_qv, bins=bins)

    # bars are anchored on the right edge of each bin; LQ is stacked on HQ
    right_edges = edges[1:]
    pylab.bar(right_edges, hq_counts, width=1, label="HQ")
    pylab.bar(right_edges, lq_counts, bottom=hq_counts, width=1, label="LQ")
    pylab.xlim([0.5, 93.5])

    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)

    twin = pylab.twinx()
    stacked = hq_counts + lq_counts
    total = np.sum(stacked)
    twin.plot(edges, [total] + list(total - np.cumsum(stacked)), "k")
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title="", clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param str title:
    :param clip_upper_SNR: SNR values above this one are set to this value

    If no SNR is available for the A nucleotide, a "no data" image is shown
    instead of the histograms.

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # upper bound of the x-axis: largest SNR found, capped by clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    # Series.clip_upper was removed from pandas; clip(upper=...) is the
    # equivalent call
    for letter in "ACGT":
        values = self._df.loc[:, "snr_{}".format(letter)]
        pylab.hist(values.clip(upper=maxSNR), alpha=alpha, label=letter,
                   bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(self, fontsize=16):
    """Plot the quality as a function of the base position

    :param int fontsize: font size of the axis labels

    The curve is labelled with the current offset and the y-range is
    widened by one unit on each side (never below zero).
    """
    curve_label = "offset: %s" % self.offset
    pylab.plot(self.quality, label=curve_label)
    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # Calling ylim switches autoscale off. To allow several calls of this
    # method on the same figure, autoscale is switched back on before the
    # limits are read.
    pylab.autoscale()
    lower, upper = pylab.ylim()
    pylab.ylim(max(0, lower - 1), upper + 1)
def boxplot_mapq_concordance(self):
    """Boxplots of the concordance for each mapping quality from 1 to 60

    Only available when :attr:`method` is "bwa" (asserted).
    """
    # method can only be bwa for now
    assert self.method == "bwa"

    records = self._get_data()
    df = pd.DataFrame(records, columns=["mapq", "length", "concordance"])

    pylab.clf()
    per_mapq = []
    for quality in range(1, 61):
        per_mapq.append(df[df.mapq == quality]['concordance'])
    pylab.boxplot(per_mapq)

    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()

    # one tick every 10 units
    ticks = [10, 20, 30, 40, 50, 60]
    pylab.xticks(ticks, ticks)
def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
    """Histograms of the isoform lengths for mapped and unmapped entries

    :param bins: bin edges forwarded to :func:`pylab.hist`. By default,
        100-unit wide bins from 0 up to the largest ``reference_length``.

    Entries whose ``reference_name`` equals -1 are considered unmapped.
    """
    df = self.df
    if bins is None:
        # max() is a scalar (calling len() on it raised a TypeError). The
        # stop is pushed one step further so that the largest value falls
        # inside the last bin.
        bins = range(0, int(df.reference_length.max()) + 100, 100)

    mapped = df[df.reference_name != -1]
    unmapped = df[df.reference_name == -1]

    pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
               label="mapped {}".format(len(mapped)), density=False)
    # NOTE(review): the "reference" column is used here whereas the mapped
    # case uses "reference_length" -- confirm which column is intended.
    pylab.hist(unmapped.reference, bins=bins, alpha=0.5,
               label="unmapped {}".format(len(unmapped)), density=False)
    pylab.xlabel("Isoform length")
    pylab.legend()
def plot_indel_dist(self, fontsize=16):
    """Plot indel count (+ ratio)

    :param int fontsize: font size of the axis labels
    :Return: list of insertions, deletions and ratio insertion/deletion for
        different length starting at 1
    :raises ValueError: if no insertions or no deletions were found

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_indel_dist()

    What you see on this figure is the presence of 10 insertions of length
    1, 1 insertion of length 2 and 3 deletions of length 1


    # Note that in samtools, several insertions or deletions in a single
    alignment are ignored and only the first one seems to be reported. For
    instance 10M1I10M1I stored only 1 insertion in its report; Same comment
    for deletions.

    .. todo:: speed up and handle long reads cases more efficiently by
        storing INDELS as histograms rather than lists

    """
    # indels are computed lazily; only a missing attribute triggers it
    try:
        self.insertions
    except AttributeError:
        self._set_indels()

    if len(self.insertions) == 0 or len(self.deletions) == 0:
        raise ValueError("No deletions or insertions found")

    # Count each length once instead of scanning the lists for every length
    deletion_counts = Counter(self.deletions)
    insertion_counts = Counter(self.insertions)

    N = max(max(deletion_counts), max(insertion_counts)) + 1
    D = [deletion_counts[i] for i in range(N)]
    I = [insertion_counts[i] for i in range(N)]
    # ratio is set to 0 where there is no deletion of that length
    R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

    fig, ax = pylab.subplots()
    ax.plot(range(N), I, marker="x", label="Insertions")
    ax.plot(range(N), D, marker="x", label="Deletions")
    ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
    ax.set_yscale("symlog")
    pylab.ylim([1, pylab.ylim()[1]])
    pylab.legend()
    pylab.grid()

    from matplotlib.ticker import MaxNLocator
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    pylab.xlabel("Indel length", fontsize=fontsize)
    pylab.ylabel("Indel count", fontsize=fontsize)
    return I, D, R
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
        ylabel="#"):
    """Histograms of the ORF and CDS lengths (linear scale)

    :param float alpha: transparency of the histograms
    :param bins: binning of both histograms
    :param str xlabel:
    :param str ylabel:

    The ORF/CDS positions are searched first if not yet available. Missing
    lengths are ignored.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # plot for all ORF and CDS
    for name in ("ORF", "CDS"):
        lengths = self._ORF_pos["len_" + name].dropna()
        count = lengths.shape[0]
        pylab.hist(lengths, alpha=alpha,
                   label=name + ", N = " + str(count), bins=bins)

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()
    title = "Length of ORF and CDS (after filter %s > %d)" % (
        self._type_filter, self._threshold)
    pylab.title(title)
def plot_coverage(self):
    """Please use :class:`GenomeCov` for more sophisticated
    tools to plot the genome coverage

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_coverage()

    """
    # coverage is computed lazily; only a missing attribute triggers it
    try:
        self.coverage
    except AttributeError:
        self._set_coverage()

    pylab.plot(self.coverage)
    # NOTE(review): the coverage values are drawn on the y-axis, yet this
    # label is put on the x-axis -- confirm whether that is intended.
    pylab.xlabel("Coverage")
def hist_coverage(self, bins=100):
    """Histogram of the coverage

    :param bins: forwarded to :func:`pylab.hist`

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.hist_coverage()

    """
    # coverage is computed lazily; only a missing attribute triggers it
    try:
        self.coverage
    except AttributeError:
        self._set_coverage()

    pylab.hist(self.coverage, bins=bins)
    pylab.xlabel("Coverage")
    pylab.ylabel("Number of mapped bases")
    pylab.grid()
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False,
        fontsize=12, grid=True, title="Repeat length",
        xlabel="Repeat length", ylabel="#", logy=True):
    """Plots histogram of the repeat lengths

    :param bins: binning of the histogram
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared
        first
    :param int fontsize: font size of the axis labels
    :param bool grid: add a grid or not
    :param str title:
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use a log scale on the y-axis
    """
    if hold is False:
        pylab.clf()

    pylab.hist(self.list_len_repeats, alpha=alpha, bins=bins)
    pylab.title(title)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)
    if logy:
        pylab.semilogy()
def plot_read_length(self):
    """Plot occurences of aligned read lengths

    The legend reports the smallest and largest lengths.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("test.bam"))
        b.plot_read_length()

    """
    lengths, occurences = self._get_read_length()
    legend_text = "min length:{}; max length:{}".format(
        min(lengths), max(lengths))

    pylab.plot(lengths, occurences, label=legend_text)
    pylab.grid()
    pylab.xlabel("Read length", fontsize=16)
    pylab.legend()
def histogram_gc_content(self):
    """Plot histogram of GC content

    One bin per percent, from 0 to 100 included.

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_gc_content()

    """
    # Edges must reach 100: values outside the edges are ignored by hist,
    # so stopping at 99 silently dropped any GC content above 99%.
    pylab.hist(self.gc_list, bins=range(0, 101))
    pylab.grid()
    pylab.title("GC content distribution (per sequence)")
    pylab.xlabel(r"Mean GC content (%)", fontsize=self.fontsize)
    pylab.xlim([0, 100])
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
    """Bar plot of the number of ORF and CDS found in each frame

    :param float alpha: transparency of the bars
    :param bins: not used
    :param str xlabel:
    :param str ylabel:
    :param float bar_width: width of each bar; the ORF and CDS bars are
        shifted by half of it on each side of the frame position

    The ORF/CDS positions are searched first if not yet available.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    def count(frame, column):
        # number of non-missing lengths in the given frame
        in_frame = self._ORF_pos[self._ORF_pos["frame"] == frame]
        return in_frame[column].dropna().shape[0]

    # number of ORF and CDS found by frame
    frames = [-3, -2, -1, 1, 2, 3]
    nb_res_ORF = [count(frame, "len_ORF") for frame in frames]
    nb_res_CDS = [count(frame, "len_CDS") for frame in frames]

    positions = np.array(frames)
    half_width = bar_width / 2
    pylab.bar(positions - half_width, nb_res_ORF, bar_width, alpha=alpha,
              label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(positions + half_width, nb_res_CDS, bar_width, alpha=alpha,
              label="CDS N = %d" % sum(nb_res_CDS))

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title reporting the mean GC is used

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    gc_content = self.df.loc[:, 'GC_content']
    mean_GC = np.mean(gc_content)

    # set title if needed
    if title is None:
        title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(gc_content, bins=bins, alpha=alpha,
               label=label + ", mean : " + str(round(mean_GC, 2))
               + ", N : " + str(len(self)))
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])

    # tight_layout is best-effort: a layout failure must not prevent the
    # plot, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_bar_mapq(self, fontsize=16, filename=None):
    """Plots bar plots of the MAPQ (quality) of alignments

    :param int fontsize: font size of the axis labels
    :param filename: if set, the figure is saved under that name

    The y-axis uses a log scale.

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_mapq()

    """
    mapq = self.get_mapq_as_df()

    # unit-wide bins from 0 up to the largest value of the first column
    highest = mapq.max().values[0]
    edges = range(0, highest + 1)
    mapq.plot(kind='hist', bins=edges, legend=False, grid=True, logy=True)

    pylab.xlabel("MAPQ", fontsize=fontsize)
    pylab.ylabel("Count", fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
def plot_acgt_content(self, stacked=False):
    """Plot the ACGT content as a function of the position

    :param stacked: a stacked bar plot is drawn only when this is exactly
        True; otherwise the default pandas plot is used

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.plot_acgt_content()

    """
    content = self.get_actg_content()

    if stacked is not True:
        content.plot()
        pylab.grid(True)
    else:
        content.plot.bar(stacked=True)

    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    pylab.ylabel("percent", fontsize=self.fontsize)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True, ylabel="#",
        label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        first
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()

    # Always defined: it used to be set only when bins was None, so that
    # providing bins raised a NameError further down.
    # NOTE(review): k was presumably meant to become the default bins;
    # bins is still forwarded untouched -- confirm the intent.
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha, label=label,
               log=logy, width=1)

    # with very few passes, force integer ticks from 0 to 5
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
    """Grouped bar plot of the :attr:`sirv` data

    :param normalise: if set, each column is divided by the corresponding
        entry of N divided by the largest entry of N
    :param ncol: columns in the legend
    :param N: one size per data set; by default the length of each item in
        :attr:`rawdata`
    :return: the dataframe that was plotted
    """
    if N is None:
        sizes = np.array([len(x) for x in self.rawdata])
    else:
        sizes = np.array(N)

    dd = pd.DataFrame(self.sirv).T
    if normalise:
        scaling = sizes / max(sizes)
        dd = dd / scaling
    dd.columns = self.labels

    dd.plot(kind="bar")
    pylab.xlabel("")
    pylab.legend(self.labels, ncol=ncol)
    pylab.tight_layout()
    return dd
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,
        alpha=1, output_filename=None):
    """Select simulated reads with an accept/reject loop and plot the result

    :param bins: binning of the input read length histogram and of the
        histogram of the accepted values
    :param xmin: start of the x range of the target curve
    :param xmax: end of the x range of the target curve
    :param step: step of the x range of the target curve
    :param burn: number of first accepted values ignored in the histogram
    :param alpha: upper bound of the acceptance probability
    :param output_filename: if not None, passed together with
        :attr:`tokeep` to ``bam_simul.filter_bool``

    :attr:`tokeep` holds one boolean per simulated read (True if accepted).
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins,
                                  density=True)

    candidates = self.bam_simul.df.read_length.values
    self.tokeep = []
    accepted = []
    current = self.bam.df.read_length.mean()

    for index in range(self.bam_simul.df.shape[0]):
        candidate = candidates[index]
        ratio = (self.target_distribution(candidate)
                 / self.target_distribution(current))
        # acceptance probability
        acceptance = min([alpha, ratio])
        draw = pylab.uniform(0, 1)
        if draw < acceptance:
            current = candidate
            accepted.append(current)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # plotting the results:
    # theoretical curve
    xs = pylab.arange(xmin, xmax, step)
    ys = self.target_distribution(xs)

    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(accepted)

    pylab.subplot(212)
    pylab.hist(accepted[burn:], bins=bins, density=1)
    pylab.plot(xs, ys, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot(self, bins=80, rwidth=0.8, **kwargs):
    """Plot the histogram of :attr:`data`

    :param bins: forwarded to :func:`pylab.hist`
    :param rwidth: forwarded to :func:`pylab.hist`
    :param kwargs: any other argument accepted by :func:`pylab.hist`

    Labels, grid and title are taken from the :attr:`xlabel`,
    :attr:`ylabel`, :attr:`fontsize`, :attr:`grid` and :attr:`title`
    attributes.
    """
    pylab.clf()
    pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)
    pylab.xlabel(self.xlabel, fontsize=self.fontsize)
    pylab.ylabel(self.ylabel, fontsize=self.fontsize)

    # The cumulative curve that used to be drawn on a twin axis was
    # disabled (kept as a string statement); that dead code was removed.

    pylab.grid(self.grid)
    pylab.title(self.title)

    # tight_layout is best-effort: a layout failure must not prevent the
    # plot, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
        grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot GC content versus read length

    :param bool hold: if False (default), the current figure is cleared
        first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid:
    :param str xlabel: not used (the x label is always "Read length")
    :param str ylabel: not used (the y label is always "GC %")
    :param cmap: colormap of the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    # means are computed on all rows and reported in the title
    mean_len = np.mean(self.df.loc[:, 'read_length'])
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])
    title = "GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" % (
        mean_len, mean_GC)

    if hold is False:
        pylab.clf()

    # rows with a missing value are excluded from the 2D histogram
    pairs = self.df.loc[:, ['read_length', 'GC_content']].dropna()
    histogram = biokit.viz.hist2d.Hist2D(pairs)
    histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                   cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def _format_plot(self, title="", xlabel="", ylabel="", rotation=0):
    """Set title, axis labels and x tick rotation of the current plot

    :param str title:
    :param str xlabel:
    :param str ylabel:
    :param rotation: rotation of the x tick labels, which are right-aligned
    """
    pylab.title(title)
    pylab.xticks(ha="right", rotation=rotation)
    for set_label, text in ((pylab.xlabel, xlabel), (pylab.ylabel, ylabel)):
        set_label(text)
def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
    """Histogram of the mapping quality

    :param bool logy: use a log scale on the y-axis
    :param xmin: lower limit of the x-axis
    :param xmax: upper limit of the x-axis
    :param int fontsize: font size of the x-axis label
    """
    mapq = self.df.mapq
    mapq.hist()
    if logy:
        pylab.semilogy()
    x_range = [xmin, xmax]
    pylab.xlim(x_range)
    pylab.xlabel("Mapping quality", fontsize=fontsize)
def hist_passes(self, maxp=50, fontsize=16):
    """Histogram of the number of passes

    :param int maxp: larger numbers of passes are set to this value; also
        used as number of bins and as upper limit of the x-axis
    :param int fontsize: font size of the axis labels
    """
    clipped = self.df.nb_passes.copy().clip(upper=maxp)
    clipped.hist(bins=maxp)
    pylab.xlim([0, maxp])
    pylab.ylabel("# count", fontsize=fontsize)
    x_text = "Passes (max {})".format(maxp)
    pylab.xlabel(x_text, fontsize=fontsize)
def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
        textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param str kind: "pie" or "barh"
    :param cmap: colormap of the pie chart
    :param threshold: taxons whose percentage is below this value are
        summed into an "others" entry. Must be strictly between 0 and 100.
    :param radius: radius of the pie chart
    :param textcolor: colour of the texts in the pie chart
    :param kargs: forwarded to the pandas plot call
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.
    """
    if len(self._df) == 0:
        return

    if not self._data_created:
        self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    # .ix was removed from pandas; .loc is the label-based equivalent
    df = self.get_taxonomy_biokit(list(self.taxons.index))
    df.loc[-1] = ["Unclassified"] * 8

    data = self.taxons.copy()
    data.loc[-1] = self.unclassified

    data = data / data.sum() * 100
    assert threshold > 0 and threshold < 100

    # Entries below the threshold are merged into 'others'. Entries equal
    # to the threshold are kept (a strict ">" dropped them from both).
    others = data[data < threshold].sum()
    data = data[data >= threshold]
    names = df.loc[data.index]['name']

    data.index = names.values
    data.loc['others'] = others

    # very old pandas versions only provide Series.sort
    try:
        data.sort_values(inplace=True)
    except AttributeError:
        data.sort(inplace=True)

    # text may be long so, let us increase the figsize a little bit
    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data
def plot_coverage(self, filename=None, fontsize=16, rm_lw=1,
        rm_color="#0099cc", rm_label="Running median", th_lw=1,
        th_color="r", th_ls="--", main_color="k", main_lw=1,
        main_kwargs={}, sample=True, set_ylimits=True):
    """ Plot coverage as a function of base position.

    :param filename: if set, the figure is saved under that name
    :param fontsize: font size of the axis labels
    :param rm_lw: line width of the running median (not drawn if 0)
    :param rm_color: line color of the running median
    :param rm_label: label for the running median
    :param th_lw: line width of the thresholds (not drawn if 0)
    :param th_color: line color of the thresholds
    :param th_ls: line style of the thresholds
    :param main_color: line color of the coverage
    :param main_lw: line width of the coverage
    :param main_kwargs: extra arguments for the coverage line (read only)
    :param sample: if there are more than 1 000 000 points, we
        use an integer step to skip data points. We can still plot
        all points at your own risk by setting this option to False
    :param set_ylimits: we want to focus on the "normal" coverage ignoring
        unusual excess. To do so, we set the yaxis range between 0 and a
        maximum value. This maximum value is set to the minimum between the
        6 times the mean coverage and 1.5 the maximum of the high coverage
        threshold curve. If you want to let the ylimits free, set this
        argument to False

    .. note:: if there are more than 1,000,000 points, only one point out
        of N is shown, N being the integer part of the number of points
        divided by 1,000,000. For instance for 5,000,000 points, one point
        out of 5 is shown.

    In addition to the coverage, the running median and coverage confidence
    corresponding to the lower and upper zscore thresholds are shown.

    .. note:: uses the thresholds attribute.
    """
    # z = (X/rm - \mu ) / sigma
    sigma = self.best_gaussian["sigma"]
    mu = self.best_gaussian["mu"]
    high_zcov = (self.thresholds.high * sigma + mu) * self.df["rm"]
    low_zcov = (self.thresholds.low * sigma + mu) * self.df["rm"]

    pylab.clf()
    ax = pylab.gca()
    ax.set_facecolor('#eeeeee')
    pylab.xlim(0, self.df["pos"].iloc[-1])
    axes = []
    labels = []

    # 1,000,000 points is a lot for matplotlib. Let us restrict ourself to
    # 1 million points for now.
    if len(self.df) > 1000000 and sample is True:
        NN = int(len(self.df) / 1000000)
    else:
        NN = 1

    # the main coverage plot
    p1, = pylab.plot(self.df["cov"][::NN], color=main_color,
                     label="Coverage", linewidth=main_lw, **main_kwargs)
    axes.append(p1)
    labels.append("Coverage")

    # The running median plot
    if rm_lw > 0:
        p2, = pylab.plot(self.df["rm"][::NN], color=rm_color,
                         linewidth=rm_lw, label=rm_label)
        axes.append(p2)
        labels.append(rm_label)

    # The threshold curves (a single legend entry for both)
    if th_lw > 0:
        p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color,
                         ls=th_ls, label="Thresholds")
        pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color,
                   ls=th_ls, label="_nolegend_")
        axes.append(p3)
        labels.append("Thresholds")

    pylab.legend(axes, labels, loc="best")
    pylab.xlabel("Position", fontsize=fontsize)
    pylab.ylabel("Per-base coverage", fontsize=fontsize)
    pylab.grid(True)

    # sometimes there are large coverage value that squeeze the plot.
    # Let us restrict it
    if set_ylimits is True:
        pylab.ylim([0, min([high_zcov.max() * 1.5,
                            self.df["cov"].mean() * 6])])
    else:
        pylab.ylim([0, pylab.ylim()[1]])

    # tight_layout is best-effort: a layout failure must not prevent the
    # plot, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        pylab.tight_layout()
    except Exception:
        pass

    if filename:
        pylab.savefig(filename)
def plot_common_major_counts(self, mode, labels=None,
        switch_up_down_cond2=False, add_venn=True, xmax=None, title="",
        fontsize=12, sortby="log2FoldChange"):
    """Percentage of common features among the top-ranked ones

    For each rank, shows the percentage of features shared by the two
    analyses among their best-ranked features (ranked by *sortby*).

    :param mode: down, up or all
    :param labels: two labels used in the Venn diagram (default r1, r2)
    :param switch_up_down_cond2: not used
    :param bool add_venn: add a Venn diagram as an inset
    :param xmax: upper limit of the x-axis (default: size of the largest
        gene list)
    :param str title:
    :param int fontsize: font size of labels and title
    :param str sortby: column used to rank the features

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana.compare import RNADiffCompare

        c = RNADiffCompare(
            sequana_data("rnadiff/rnadiff_onecond_1"),
            sequana_data("rnadiff/rnadiff_onecond_2"))
        c.plot_common_major_counts("down")
    """
    if labels is None:
        labels = ['r1', 'r2']

    # Each analysis is ranked on its own gene list (the second one used to
    # be indexed with the gene list of the first one). Lists are used since
    # sets are not valid .loc indexers in recent pandas versions.
    gl1 = list(set(self.r1.gene_lists[mode]))
    gl2 = list(set(self.r2.gene_lists[mode]))

    # "down" features have negative values: the most extreme come first in
    # ascending order; for the other modes, the order is descending
    ascending = mode in ["down"]
    A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=ascending)
    B = self.r2.df.loc[gl2].sort_values(by=sortby, ascending=ascending)
    # sometimes, up and down may be inverted as compared to the other
    # conditions

    # percentage of common features within the first i of each ranking
    N = []
    for i in range(1, max(len(A), len(B))):
        a = A.iloc[0:i].index
        b = B.iloc[0:i].index
        n = len(set(b).intersection(set(a)))
        N.append(n / i * 100)

    max_common = len(set(A.index).intersection(set(B.index)))
    pylab.clf()
    if len(A) > len(B):
        pylab.axhline(max_common / len(A) * 100, color="r", ls='--',
                      label="min set intersection")
        pylab.axvline(len(B), ls="--", color="k",
                      label="rank of minor set")
    else:
        pylab.axhline(max_common / len(B) * 100, color='r', ls='--',
                      label="min set intersect")
        pylab.axvline(len(A), ls="--", color="k",
                      label="rank of minor set")

    pylab.plot(N)
    pylab.xlabel('rank', fontsize=fontsize)
    pylab.ylabel('% common features', fontsize=fontsize)
    pylab.grid(True)
    pylab.ylim([0, 100])
    if xmax:
        pylab.xlim([0, xmax])
    else:
        pylab.xlim([0, max(len(A), len(B))])
    pylab.title(title, fontsize=fontsize)

    # values of the ranking column of the first analysis on a twin axis
    ax = pylab.gca()
    ax2 = ax.twinx()
    ax2.plot(A[sortby].values, "orange", label=sortby)
    ax2.set_ylabel(sortby)
    pylab.legend(loc="lower left")
    ax.legend(loc="lower right")

    if add_venn:
        f = pylab.gcf()
        ax = f.add_axes([0.5, 0.5, 0.35, 0.35], facecolor="grey")
        if mode == "down":
            self.plot_venn_down(ax=ax, title=None, labels=labels,
                                mode="two_only")
        elif mode == "up":
            self.plot_venn_up(ax=ax, title=None, labels=labels,
                              mode="two_only")
        elif mode == "all":
            self.plot_venn_all(ax=ax, title=None, labels=labels,
                               mode="two_only")
def hist_period_size(self, bins=50):
    """Length of the repetitions

    :param bins: binning of the histogram of the ``period_size`` column
    """
    period_sizes = self.df.period_size
    period_sizes.hist(bins=bins)
    pylab.xlabel("repeat length")
def hist_repet_by_sequence(self):
    """Histogram of the number of repetitions per sequence

    Rows are grouped by ``sequence_name`` and the size of each group is
    counted.
    """
    # How many repetitions per sequence
    groups = self.df.groupby("sequence_name").groups
    repetitions = []
    for members in groups.values():
        repetitions.append(len(members))
    pylab.hist(repetitions)
    pylab.xlabel("# repetitions per sequence")
def plot(self, kind="pie", cmap="tab20c", threshold=1, radius=0.9,
        textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param str kind: "pie" or "barh"
    :param cmap: colormap of the pie chart
    :param threshold: taxons whose percentage is below this value are
        summed into an "others" entry. Must be strictly between 0 and 100.
    :param radius: radius of the pie chart
    :param textcolor: colour of the texts in the pie chart
    :param kargs: forwarded to the pandas plot call
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.

    .. todo:: For a future release, we could use this kind of plot
        https://stackoverflow.com/questions/57720935/how-to-use-correct-cmap-colors-in-nested-pie-chart-in-matplotlib
    """
    if len(self._df) == 0:
        return

    if not self._data_created:
        self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    df = self.get_taxonomy_db(list(self.taxons.index))

    # we add the unclassified only if needed
    if self.unclassified > 0:
        df.loc[-1] = ["Unclassified"] * 8

    data = self.taxons.copy()

    # we add the unclassified only if needed
    if self.unclassified > 0:
        data.loc[-1] = self.unclassified

    data = data / data.sum() * 100
    assert threshold > 0 and threshold < 100

    # everything below the threshold (1) is gather together and summarised
    # into 'others'
    others = data[data < threshold].sum()
    data = data[data >= threshold]
    names = df.loc[data.index]['name']

    data.index = names.values
    if others > 0:
        data.loc['others'] = others

    # Very old pandas versions only provide Series.sort. Only the missing
    # method triggers the fallback; other errors are no longer hidden.
    try:
        data.sort_values(inplace=True)
    except AttributeError:
        data.sort(inplace=True)

    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data
def hist_passes(self, maxp=50, fontsize=16):
    """Histogram of the number of passes

    :param int maxp: larger numbers of passes are set to this value; also
        used as number of bins and as upper limit of the x-axis
    :param int fontsize: font size of the axis labels
    """
    passes = self.df.nb_passes.copy()
    # Series.clip_upper was removed from pandas; clip(upper=...) is the
    # equivalent call
    passes.clip(upper=maxp).hist(bins=maxp)
    pylab.xlim([0, maxp])
    pylab.ylabel("# count", fontsize=fontsize)
    pylab.xlabel("Passes (max {})".format(maxp), fontsize=fontsize)
def plot_volcano(self, labels=None):
    """Volcano plot of log2 fold change versus log10 of adjusted p-value

    :param list labels: two legend labels, one per data set. Defaults to
        ``["cond1", "cond2"]``.

    The figure is pickable: clicking on a point sets the title to the
    gene name and its coordinates in the picked data set and, when the
    gene is also present in the other data set, its coordinates there.
    The last pick event is stored in ``self.event``.

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana.compare import RNADiffCompare

        c = RNADiffCompare(
            sequana_data("rnadiff/rnadiff_onecond_1"),
            sequana_data("rnadiff/rnadiff_onecond_2"))
        c.plot_volcano()
    """
    if labels is None:
        labels = ["cond1", "cond2"]

    A = self.r1.df.loc[self.r1.gene_lists["all"]]
    B = self.r2.df.loc[self.r2.gene_lists["all"]]

    pylab.clf()
    # Keep a handle on the first line: the pick callback uses it to tell
    # which data set was clicked, whatever the legend labels are.
    (line_a,) = pylab.plot(A.log2FoldChange, -np.log10(A.padj), marker="o",
                           alpha=0.5, color="r", lw=0, label=labels[0],
                           pickradius=4, picker=True)
    pylab.plot(B.log2FoldChange, -np.log10(B.padj), marker="x",
               alpha=0.5, color="k", lw=0, label=labels[1],
               pickradius=4, picker=True)

    pylab.grid(True)
    pylab.xlabel("fold change")
    pylab.ylabel("log10 adjusted p-value")
    pylab.legend(loc="lower right")
    ax = pylab.gca()

    def _coordinates(df, gene_name):
        # rounded (x, y) position of a gene in the volcano plot
        row = df.loc[gene_name]
        return round(row.log2FoldChange, 1), round(-np.log10(row.padj), 1)

    def onpick(event):
        self.event = event
        if event.artist is line_a:
            picked, other = A, B
        else:
            picked, other = B, A

        gene_name = picked.index[event.ind[0]]
        x1, y1 = _coordinates(picked, gene_name)
        try:
            x2, y2 = _coordinates(other, gene_name)
        except KeyError:
            # gene absent from the other data set
            x2, y2 = None, None

        try:
            if x2 is None:
                ax.title.set_text("{} at pos [{},{}]".format(
                    gene_name, x1, y1))
            else:
                ax.title.set_text("{} at pos [{},{}] and [{},{}]".format(
                    gene_name, x1, y1, x2, y2))
        except Exception:
            # best effort: never let a title update break the GUI callback
            print("exception")
            ax.title.set_text("")
        pylab.draw()

    fig = pylab.gcf()
    fig.canvas.mpl_connect('pick_event', onpick)
sequence_reference) print("reference sequence loaded") x = range(100, 10000, 300) res_time = [] for i in x: time1 = time.clock() alignment_best = sequence_reference_2x(sequence_reference[0:i]) time2 = time.clock() print("Align %d pb in %f, score = %d" % (i, time2 - time1, alignment_best.optimal_alignment_score)) res_time.append(time2 - time1) df_res_time = pd.DataFrame([x, res_time]) df_res_time = df_res_time.transpose() df_res_time.columns = ["len_seq", "time_minutes"] df_res_time["time_minutes"] = df_res_time["time_minutes"] / float(60) df_res_time.to_csv("2017_03_20_SW_time.csv") pylab.plot(df_res_time["len_seq"], df_res_time["time_minutes"], "b-") pylab.xlabel("Length of aligned sequence") pylab.ylabel("Time (minutes)") pylab.title("Time for Smith waterman alignment with scikit 0.5.1") pylab.show() estim = df_res_time["time_minutes"].iloc[-1] * len_genome / float( df_res_time["len_seq"].iloc[-1] * 60) # in minutes print("Time estimation (linear) for contig of %d = %f minutes (%f hours)" % (len_genome, estim, estim / 60))
for i in range(len(list_analysis)): analysis = list_analysis[i] res = compute_table_performance(analysis, df_results) print("%s" % analysis) # [TP, FP, FN, TN] # print(len(res[0]), len(res[1]), res[2], res[3] , sum([len(res[0]), len(res[1]), res[2], res[3]])) TP = res[0] FP = res[1] FN = [0] * res[2] TN = [0] * res[3] y_true = np.array([1] * len(TP) + [1] * len(FN) + [0] * len(FP) + [0] * len(TN)) y_scores = np.array(TP + FN + FP + TN) precision, recall, thresholds = precision_recall_curve(y_true, y_scores) pylab.plot(recall, precision, color=colors[i], label=analysis) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.ylim([0.0, 1.05]) pylab.xlim([0.0, 1.05]) pylab.title('Precision-Recall') #pylab.legend(loc="lower left") lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) #pylab.tight_layout() if file_fig != "show": pylab.savefig(file_fig, bbox_extra_artists=(lgd, ), bbox_inches='tight') else: pylab.show()
def plot_pvalue_hist(self, bins=60, fontsize=16, rotation=0):
    """Plot histogram of the raw p-values found in ``self.df.pvalue``

    Missing values are dropped before plotting.

    :param bins: binning passed to the histogram
    :param int fontsize: fontsize of the x and y labels
    :param rotation: currently unused; kept for backward compatibility
    """
    pylab.hist(self.df.pvalue.dropna(), bins=bins, ec="k")
    pylab.xlabel("raw p-value", fontsize=fontsize)
    pylab.ylabel("Occurences", fontsize=fontsize)
def plot_go_terms(self, ontologies, max_features=50, log=False, fontsize=8,
                  minimum_genes=0, pvalue=0.05, cmap="summer_r",
                  sort_by="fold_enrichment", show_pvalues=False,
                  include_negative_enrichment=False, fdr_threshold=0.05,
                  compute_levels=True, progress=True):
    """Scatter plot of the enriched GO terms

    Each selected GO term is one row of the plot: the x position is its
    fold enrichment, the color its FDR and the marker size scales with
    its ``number_in_list`` value.

    :param ontologies: passed to :meth:`get_data` to select the data.
    :param int max_features: maximum number of GO terms shown.
    :param bool log: plot the log2 of the fold enrichment.
    :param int fontsize: fontsize of the y tick labels.
    :param int minimum_genes: keep GO terms whose ``number_in_list`` is
        strictly greater than this value.
    :param float pvalue: keep GO terms with ``pValue`` <= this value.
    :param cmap: colormap used for the FDR.
    :param str sort_by: one of 'pValue', 'fold_enrichment', 'fdr'.
    :param bool show_pvalues: overlay the -log10 of the p-values on a
        twin x-axis.
    :param bool include_negative_enrichment: passed to :meth:`get_data`;
        if True the x-axis is symmetric around zero.
    :param float fdr_threshold: passed to :meth:`get_data` (as *fdr*) and
        used as upper bound of the color scale.
    :param bool compute_levels: call :meth:`get_graph` to retrieve the
        level shown in each tick label (otherwise left blank).
    :param bool progress: passed to :meth:`get_graph`.
    :return: the filtered dataframe. If the data is empty, or no GO term
        survives the filtering, it is returned early and nothing is
        plotted. Otherwise it is sorted and ``self.df`` and
        ``self.subdf`` (the plotted subset) are set as well.
    """
    assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

    # FIXME: pvalue and fold_enrichment not sorted in same order
    pylab.clf()
    df = self.get_data(
        ontologies,
        include_negative_enrichment=include_negative_enrichment,
        fdr=fdr_threshold)

    if len(df) == 0:
        return df

    df = df.query("pValue<=@pvalue")
    logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
        pvalue, len(df)))
    df = df.reset_index(drop=True)

    # Select a subset of the data to keep the best max_features in terms of
    # pValue
    subdf = df.query("number_in_list>@minimum_genes").copy()
    logger.info(
        "Filtering out GO terms with less than {} genes: Kept {} GO terms".
        format(minimum_genes, len(subdf)))

    logger.info("Filtering out the 3 parent terms")
    subdf = subdf.query("id not in @self.ontologies")

    # Nothing survived the filters: the size and level computations below
    # cannot handle an empty selection (division by zero), so stop here.
    if len(subdf) == 0:
        logger.warning("No GO terms left after filtering. Nothing to plot")
        return df

    # Keeping only a part of the data, sorting by pValue
    if sort_by == "pValue":
        subdf = subdf.sort_values(by="pValue",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="pValue", ascending=False)
    elif sort_by == "fold_enrichment":
        subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                  ascending=True).iloc[-max_features:]
        df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
    elif sort_by == "fdr":
        subdf = subdf.sort_values(by="fdr",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="fdr", ascending=False)

    subdf = subdf.reset_index(drop=True)

    # We get all levels for each go id.
    # They are stored by MF, CC or BP
    if compute_levels:
        paths = self.get_graph(list(subdf['id'].values), progress=progress)
        keys = list(paths.keys())
        # merge the per-ontology mappings into the first one
        goid_levels = paths[keys[0]]
        for k in keys[1:]:
            goid_levels.update(paths[k])
        subdf["level"] = [goid_levels[ID] for ID in subdf['id'].values]
    else:
        subdf['level'] = ""

    N = len(subdf)

    # marker sizes: proportional to number_in_list, with a lower bound
    size_factor = 12000 / N
    max_size = subdf.number_in_list.max()
    min_size = subdf.number_in_list.min()
    sizes = [
        max(max_size * 0.2, x)
        for x in size_factor * subdf.number_in_list.values / max_size
    ]

    # small, medium and large sizes used in the legend
    m1 = min(sizes)
    m3 = max(sizes)
    m2 = m1 + (m3 - m1) / 2

    if log:
        pylab.scatter(pylab.log2(subdf.fold_enrichment), range(N),
                      c=subdf.fdr, s=sizes, cmap=cmap, alpha=0.8, ec="k",
                      vmin=0, vmax=fdr_threshold, zorder=10)
    else:
        pylab.scatter(subdf.fold_enrichment, range(N), c=subdf.fdr,
                      cmap=cmap, s=sizes, ec="k", alpha=.8, vmin=0,
                      vmax=fdr_threshold, zorder=10)
    pylab.grid(zorder=-10)
    ax2 = pylab.colorbar(shrink=0.5)
    ax2.ax.set_ylabel('FDR')

    # truncate long labels
    labels = [
        x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
    ]
    ticks = [
        "{} ({}) {}".format(ID, level, "; " + label.title())
        for level, ID, label in zip(subdf['level'], subdf.id, labels)
    ]

    pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

    yax = pylab.gca().get_yaxis()
    try:
        pad = [x.label.get_window_extent().width for x in yax.majorTicks]
        yax.set_tick_params(pad=max(pad))
    except Exception:
        yax.set_tick_params(pad=60 * fontsize * 0.7)
    # NOTE(review): this unconditional call overrides both paddings set
    # just above; kept as is to preserve the current layout.
    yax.set_tick_params(pad=60 * fontsize * 0.6)

    fc_max = subdf.fold_enrichment.max(skipna=True)
    fc_min = subdf.fold_enrichment.min(skipna=True)
    # go into log2 space
    fc_max = pylab.log2(fc_max)
    fc_min = pylab.log2(fc_min)
    abs_max = max(fc_max, abs(fc_min), 1)

    if log:
        fc_max = abs_max * 1.5
    else:
        fc_max = 2**abs_max * 1.2

    pylab.axvline(0, color="k", lw=2)
    if log:
        pylab.xlabel("Fold Enrichment (log2)")
    else:
        pylab.xlabel("Fold Enrichment")
    if include_negative_enrichment:
        pylab.xlim([-fc_max, fc_max])
    else:
        pylab.xlim([0, fc_max])
    pylab.tight_layout()

    # The pvalue:
    if show_pvalues:
        ax = pylab.gca().twiny()
        ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
        ax.set_xlabel("p-values (log10)", fontsize=12)
        ax.plot(-pylab.log10(subdf.pValue), range(N), label="pvalue",
                lw=2, color="k")
        ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
        pylab.tight_layout()
        pylab.legend(loc="lower right")

    # empty scatters: only used as handles for the size legend
    s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
    s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
    s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

    # legend layout depends on the number of rows
    if N < 10:
        labelspacing = 1.5 * 4
        borderpad = 4
        handletextpad = 2
    elif N < 20:
        labelspacing = 1.5 * 2
        borderpad = 1
        handletextpad = 2
    else:
        labelspacing = 1.5
        borderpad = 2
        handletextpad = 2

    if N >= 3:
        leg = pylab.legend(
            (s1, s2, s3),
            (str(int(min_size)),
             str(int(min_size + (max_size - min_size) / 2)),
             str(int(max_size))),
            scatterpoints=1, loc='lower right', ncol=1, frameon=True,
            title="gene-set size", labelspacing=labelspacing,
            borderpad=borderpad, handletextpad=handletextpad, fontsize=8)
    else:
        leg = pylab.legend((s1, ), (str(int(min_size)), ),
                           scatterpoints=1, loc='lower right', ncol=1,
                           frameon=True, title="gene-set size",
                           labelspacing=labelspacing, borderpad=borderpad,
                           handletextpad=handletextpad, fontsize=8)

    frame = leg.get_frame()
    frame.set_facecolor('#b4aeae')
    frame.set_edgecolor('black')
    frame.set_alpha(1)

    self.subdf = subdf
    self.df = df
    return df
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title="",
             clip_upper_SNR=30):
    """Overlaid histograms of the SNR of the four nucleotides (A, C, G, T)

    :param int bins: number of bin edges; they span from 0 to the largest
        SNR found in the data, itself limited to *clip_upper_SNR*
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: fontsize of the labels and title
    :param bool grid: if True, add a grid
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the figure
    :param clip_upper_SNR: SNR values above this limit are set to it

    If no SNR is available, a *no data* image is shown instead.

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    nucleotides = "ACGT"
    columns = ["snr_{}".format(nucleotide) for nucleotide in nucleotides]

    # largest SNR over the four columns, limited to clip_upper_SNR
    upper = 0
    for column in columns:
        column_max = self._df.loc[:, column].max()
        if column_max > upper:
            upper = column_max
    if upper > clip_upper_SNR:
        upper = clip_upper_SNR

    # same bin edges for the four histograms
    edges = pylab.linspace(0, upper, bins)
    for nucleotide, column in zip(nucleotides, columns):
        values = self._df.loc[:, column].clip(upper=upper)
        pylab.hist(values, alpha=alpha, label=nucleotide, bins=edges)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(self, bins=100, cmap="hot_r", fontsize=10, Nlevels=4,
         xlabel=None, ylabel=None, norm=None, range=None, normed=False,
         colorbar=True, contour=True, grid=True, **kargs):
    """plots histogram of mean across replicates versus coefficient variation

    :param int bins: binning for the 2D histogram (either a float or list
        of 2 binning values).
    :param cmap: a valid colormap (defaults to hot_r)
    :param fontsize: fontsize for the labels
    :param int Nlevels: must be more than 2
    :param str xlabel: set the xlabel (overwrites content of the dataframe)
    :param str ylabel: set the ylabel (overwrites content of the dataframe)
    :param norm: set to 'log' to show the log10 of the values.
    :param normed: normalise the data
    :param range: as in pylab.Hist2D : a 2x2 shape [[-3,3],[-4,4]]. It is
        taken into account whatever the value of *norm*.
    :param bool colorbar: add a colorbar (defaults to True)
    :param contour: show some contours (default to True)
    :param bool grid: Show underlying grid (defaults to True)
    :param kargs: currently unused
    :return: the output of the pylab.hist2d call

    If the input is a dataframe, the xlabel and ylabel will be populated
    with the column names of the dataframe.

    """
    X = self.df[self.df.columns[0]].values
    Y = self.df[self.df.columns[1]].values
    if len(X) > 10000:
        logger.info("Computing 2D histogram. Please wait")

    pylab.clf()
    # options shared by the linear and log versions, so that the user
    # range is never dropped
    options = dict(bins=bins, cmap=cmap, density=normed, range=range)
    if norm == 'log':
        from matplotlib import colors
        options["norm"] = colors.LogNorm()
    res = pylab.hist2d(X, Y, **options)

    if colorbar is True:
        pylab.colorbar()

    if contour:
        # bins is either a pair (one binning per axis) or a single value
        try:
            bins1, bins2 = bins[0], bins[1]
        except (TypeError, IndexError):
            bins1 = bins2 = bins

        # res is (counts, xedges, yedges, image): drop the extra edge so
        # that the grid has the same shape as the counts
        counts = res[0]
        xgrid, ygrid = pylab.meshgrid(res[1][0:bins1], res[2][0:bins2])

        if counts.max() < 10 and norm == 'log':
            pylab.contour(xgrid, ygrid, counts.transpose())
        else:
            # log-spaced levels between 1 and the maximum count; the two
            # lowest ones are not drawn
            levels = [
                round(x) for x in pylab.logspace(
                    0, pylab.log10(counts.max()), Nlevels)
            ]
            pylab.contour(xgrid, ygrid, counts.transpose(), levels[2:])

    if ylabel is None:
        ylabel = self.df.columns[1]
    if xlabel is None:
        xlabel = self.df.columns[0]

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)
    return res
return genome_not_covered ################################ PLOT ############################################################################################## if do_plot: cmap = pylab.cm.get_cmap(colormap) # shuffle colors : in case 2 adjacent contigs have the same color, user can plot again to see better shuffle_col = list(np.linspace(0,1,res_best.shape[0])) shuffle(shuffle_col) colors = [cmap(i) for i in shuffle_col] pylab.plot(res_best["qLength"], res_best["score_norm"],"bo",alpha=0.5) pylab.xlabel("Length of contig") pylab.ylabel("Score blasr (normalised by length)") pylab.title(title_plot) if save_plot: pylab.savefig(file_plot.replace(".png","_scores.png")) else: pylab.show() fig, axarr = pylab.subplots(2,figsize=figsize, sharex=True) fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10) # plot coverage found by blasr, with score ax = axarr[0] list_contigs = plot_contigs(res_best, ax, mode="score") genome_not_covered = areas_not_covered(list_contigs, len_genome) # add grey on not covered areas for area in genome_not_covered: