Beispiel #1
0
    def venn(self, compa_list, direction="all", prefix=""):
        """
        Plot venn diagram(s) comparing dr gene lists and save them as a PDF.

        :param compa_list: either a flat list of comparison names from the
            Deseq2 results (one diagram is drawn), or a list of such lists,
            e.g. [["WT", "KO1"], ["WT", "KO2"]] (one diagram per inner list,
            stacked vertically in a single figure).
        :param direction: key selecting which dr genes are considered
            (up/down/all) in ``self.dr_gene_lists[comparison]``.
        :param prefix: string prepended to the output file name.
        :raises ValueError: if compa_list is empty.

        The figure is written to
        ``<self.out_dir>/vennDiagrams/<prefix>vennDiagrams_<direction>.pdf``.
        """
        # all() is True on an empty iterable, so an empty input would otherwise
        # reach the multi-panel branch with nothing to draw.
        if len(compa_list) == 0:
            raise ValueError("compa_list must contain at least one comparison")

        from sequana.viz.venn import plot_venn

        # If compa_list is a list of lists of comparisons: one panel per group
        if all(isinstance(group, list) for group in compa_list):
            n_panels = len(compa_list)
            # The grid follows the number of groups (20 inches of height for 6
            # panels, as before); squeeze=False keeps an array of axes even
            # when there is a single panel.
            fig, axes = pylab.subplots(
                n_panels, 1, figsize=(6, 20 * n_panels / 6), squeeze=False
            )

            for ax, group in zip(axes.flat, compa_list):
                plot_venn(
                    [self.dr_gene_lists[x][direction] for x in group],
                    list(group),
                    ax=ax,
                )
        # If compa_list is only a list of comparisons: a single diagram
        else:
            plot_venn(
                [self.dr_gene_lists[x][direction] for x in compa_list],
                list(compa_list),
            )

        out_dir = os.path.join(self.out_dir, "vennDiagrams")
        os.makedirs(out_dir, exist_ok=True)
        outfile = os.path.join(out_dir, f"{prefix}vennDiagrams_{direction}.pdf")

        pylab.savefig(outfile, bbox_inches="tight")
Beispiel #2
0
    def plot_indel_dist(self, fontsize=16):
        """Plot indel count (+ ratio)

        :param fontsize: font size of the axis labels.
        :Return: list of insertions, deletions and ratio insertion/deletion for
            different length starting at 1
        :raises ValueError: if either the insertions or the deletions are empty.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_indel_dist()

        What you see on this figure is the presence of 10 insertions of length
        1, 1 insertion of length 2 and 3 deletions of length 1


        # Note that in samtools, several insertions or deletions in a single
        alignment are ignored and only the first one seems to be reported. For
        instance 10M1I10M1I stored only 1 insertion in its report; Same comment
        for deletions.

        .. todo:: speed up and handle long reads cases more efficiently by
            storing INDELS as histograms rather than lists
        """
        # Indels are computed on first use; only a missing attribute should
        # trigger the computation, other errors must propagate.
        try:
            self.insertions
        except AttributeError:
            self._set_indels()

        if len(self.insertions) == 0 or len(self.deletions) == 0:
            raise ValueError("No deletions or insertions found")

        # Count each length once instead of rescanning the lists for every
        # length (a Counter returns 0 for lengths that never occur).
        deletion_counts = Counter(self.deletions)
        insertion_counts = Counter(self.insertions)

        N = max(max(deletion_counts), max(insertion_counts)) + 1
        D = [deletion_counts[i] for i in range(N)]
        I = [insertion_counts[i] for i in range(N)]
        # Ratio is reported as 0 where there is no deletion of that length
        R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

        fig, ax = pylab.subplots()
        ax.plot(range(N), I, marker="x", label="Insertions")
        ax.plot(range(N), D, marker="x", label="Deletions")
        ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
        ax.set_yscale("symlog")
        pylab.ylim([1, pylab.ylim()[1]])
        pylab.legend()
        pylab.grid()
        from matplotlib.ticker import MaxNLocator
        # Indel lengths are integers: keep integer ticks on the x axis
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        pylab.xlabel("Indel length", fontsize=fontsize)
        pylab.ylabel("Indel count", fontsize=fontsize)
        return I, D, R
Beispiel #3
0
 def plot_rank_vs_idr_score(self, filename=None, savefig=False):
     """Plot the 'local_idr' column against rank order, one panel per replicate.

     For each replicate, rows of ``self.df`` are sorted by decreasing
     'rep1_rank' (top panel) or 'rep2_rank' (bottom panel) and their
     'local_idr' values are plotted against the row position. A dashed blue
     vertical line is drawn at ``len(self.df) - self.N_significant_peaks``.

     :param filename: path handed to ``pylab.savefig`` when savefig is True.
     :param savefig: if True, save the figure to filename.
     """
     _, panels = pylab.subplots(2, 1)
     table = self.df

     # (rank column, plot format, y label) for the top and bottom panels
     layout = (
         ("rep1_rank", "o", "log10 IDR for replicate 1"),
         ("rep2_rank", "ro", "log10 IDR for replicate 2"),
     )
     for panel, (rank_column, marker, ylabel) in zip(panels, layout):
         ordered = table.sort_values(by=rank_column, ascending=False)
         panel.plot(range(len(table)), ordered["local_idr"], marker)
         panel.set_ylabel(ylabel)
         cutoff = len(self.df) - self.N_significant_peaks
         panel.axvline(cutoff, color="b", ls="--")

     if savefig:
         pylab.savefig(filename)
Beispiel #4
0
    def plot_all_skews(self, figsize=(10, 12), fontsize=16, alpha=0.5):
        """Draw the skew and nucleotide-content curves in one 9-row figure.

        Panels, top to bottom: GC skew, its cumulative sum, AT skew, its
        cumulative sum, cumulative RY skew, cumulative MK skew, cumulative
        H-bond skew, GC content and AT content. All panels share the x axis.

        :param figsize: size of the figure passed to ``pylab.subplots``.
        :param fontsize: font size of the figure title.
        :param alpha: transparency applied to every curve.
        :raises AttributeError: if no window has been set (``_window`` is None).
        """
        if self._window is None:
            raise AttributeError("Please set a valid window to compute skew")

        def draw(axis, series, style, ylabel, title=None, **label_kwargs):
            # One panel: optional title, the curve, then the y label
            if title is not None:
                axis.set_title(title)
            axis.plot(series, style, alpha=alpha)
            axis.set_ylabel(ylabel, **label_kwargs)

        # 9 rows; the FFT panel (a 10th row) is currently disabled
        fig, axarr = pylab.subplots(9, 1, sharex=True, figsize=figsize)

        title_template = (
            "Window size = %d (%.0f %% of genome )\n"
            "        GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%"
        )
        main_title = title_template % (
            self._window,
            self._window * 100 / self.__len__(),
            self.gc_content() * 100,
            (1 - self.gc_content()) * 100,
            self._ignored_nuc * 100,
        )
        pylab.suptitle(main_title, fontsize=fontsize)

        # GC skew and its cumulative sum
        gc_label = "(G -C) / (G + C)"
        draw(axarr[0], list(self._GC_skew_slide[0]), 'b-', gc_label,
             title="GC skew (blue) - Cumulative sum (red)")
        draw(axarr[1], list(np.cumsum(self._GC_skew_slide[0])), 'r-', gc_label)

        # AT skew and its cumulative sum
        at_label = "(A -T) / (A + T)"
        draw(axarr[2], list(self._AT_skew_slide[0]), 'b-', at_label,
             title="AT skew (blue) - Cumulative sum (red)")
        draw(axarr[3], list(np.cumsum(self._AT_skew_slide[0])), 'r-', at_label,
             rotation=0)

        # Cumulative skews Xn, Yn and Zn
        draw(axarr[4], self._Xn, 'g-', "(A + G) - (C + T)",
             title="Cumulative RY skew (Purine - Pyrimidine)")
        draw(axarr[5], self._Yn, 'g-', "(A + C) - (G + T)",
             title="Cumulative MK skew (Amino - Keto)")
        draw(axarr[6], self._Zn, 'g-', "(A + T) - (G + C)",
             title="Cumulative H-bond skew (Weak H-bond - Strong H-bond)")

        # GC and AT content
        draw(axarr[7], list(self._GC_content_slide[0]), 'k-', "GC",
             title="GC content")
        draw(axarr[8], list(self._AT_content_slide[0]), 'k-', "AT",
             title="AT content")

        # Leave room above the panels for the two-line figure title
        fig.tight_layout()
        fig.subplots_adjust(top=0.88)
Beispiel #5
0
    shuffle(shuffle_col)
    colors = [cmap(i) for i in shuffle_col]

    pylab.plot(res_best["qLength"], res_best["score_norm"], "bo", alpha=0.5)
    pylab.xlabel("Length of contig")
    pylab.ylabel("Score blasr (normalised by length)")
    pylab.title(title_plot)
    if save_plot:
        pylab.savefig(file_plot.replace(".png", "_scores.png"))
    else:
        pylab.show()

    ##### Plot by reference

    ref_found = list(res_best["reference"].unique())
    fig, axarr = pylab.subplots(2 * len(ref_found),
                                figsize=(15, 8 * len(ref_found)))
    #fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10)

    for i in range(len(ref_found)):
        # keep only contigs aligned on this reference
        res_best_ref = res_best[res_best["reference"] == ref_found[i]]
        len_genome = df_genome_len.loc[df_genome_len["name"] == ref_found[i],
                                       "length"].values[0]

        # plot coverage found by blasr, with score
        ax = axarr[i * 2]
        list_contigs = plot_contigs(res_best_ref, ax, colors, mode="score")
        genome_not_covered = areas_not_covered(list_contigs, len_genome)
        # add grey on not covered areas
        for area in genome_not_covered:
            ax.axvspan(area[0], area[1], alpha=0.1, color='k')
Beispiel #6
0
# One entry per line of the already-open file f: the text before the first
# newline of each line (i.e. the line without its line terminator).
list_input = [name.split('\n')[0] for name in f.readlines()]
f.close()

# list of labels, read the same way from the file named by labels_input
f = open(labels_input, 'r')
list_labels = [name.split('\n')[0] for name in f.readlines()]
f.close()


################################ EXECUTE ##############################################################################################




#pylab.figure(figsize=(5, 5))
# Two independent single-axes figures of 5x5 inches
fig1, ax1 = pylab.subplots(1,1, figsize=(5, 5))
fig2, ax2 = pylab.subplots(1,1, figsize=(5, 5))

# Starts empty; how it is filled is not visible in this part of the script
res_PS = []

for i in range(len(list_input)):
	df = pd.read_csv(list_input[i])

	##### Precision without unknown taxons
	good_class_rank = df["good_classification_at_level"].sum()
	tot_class_rank = df["good_classification_at_level"].sum() + df["wrong_classification_at_level"].sum()
	tot = df["total_N_reads"].sum()
	wrong_class_above = df["wrong_classification_above_level"].sum()

	##### Precision without unknown taxons
	# some reads are classified but we dont find any info : cannot ignore them
Beispiel #7
0
#colors = [cmap(i) for i in np.linspace(0,1,len(list_analysis))]

# positions of genome
gen_pos = [[i, i + step - 1] for i in range(0, len_genome, step)]
y_pos = list(np.linspace(0, 1, len(analysis_names) + 2))

if custom_colors:
    y_col = [colors[i] for i in range(len(analysis_names))]
else:
    y_col = [cmap(i) for i in np.linspace(0, 1, len(analysis_names))]

pylab.close('all')

# create figure
fig, axarr = pylab.subplots(len(gen_pos),
                            1,
                            figsize=(int(step / 20000),
                                     int(len(gen_pos)) * 1.1))
for i in range(len(gen_pos)):
    subplot_variant_position(df_result, i, gen_pos, axarr, analysis_names,
                             y_pos, y_col, be_repeats_concat)

# add grey at the end (no genome)
ax = axarr[-1]
ax.axvspan(len_genome, gen_pos[-1][1], alpha=0.5, color='k')

#fig.subplots_adjust(bottom=0.2)
#fig.tight_layout()
pylab.subplots_adjust(hspace=hspace_subplots)
pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#pylab.legend(loc="lower left")
if file_fig != "show":
	cmap = pylab.cm.get_cmap(colormap)
	# shuffle colors :  in case 2 adjacent contigs have the same color, user can plot again to see better
	shuffle_col = list(np.linspace(0,1,res_best.shape[0]))
	shuffle(shuffle_col)
	colors = [cmap(i) for i in shuffle_col]
	
	pylab.plot(res_best["qLength"], res_best["score_norm"],"bo",alpha=0.5)
	pylab.xlabel("Length of contig")
	pylab.ylabel("Score blasr (normalised by length)")
	pylab.title(title_plot)
	if save_plot:
		pylab.savefig(file_plot.replace(".png","_scores.png"))
	else:
		pylab.show()

	fig, axarr = pylab.subplots(2,figsize=figsize, sharex=True)
	fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10)
	# plot coverage found by blasr, with score
	ax = axarr[0]
	list_contigs = plot_contigs(res_best, ax, mode="score")
	genome_not_covered = areas_not_covered(list_contigs, len_genome)
	# add grey on not covered areas
	for area in genome_not_covered:
		ax.axvspan(area[0],area[1], alpha=0.1, color='k')
	ax.set_ylabel("Score blasr (normalised by length)")

	# plot coverage found by blasr, with random y distribution (to see if there are overlaps)
	ax = axarr[1]
	list_contigs = plot_contigs(res_best, ax, mode="random")
	for area in genome_not_covered:
		ax.axvspan(area[0],area[1], alpha=0.05, color='k')
    # plot score nor normalised
    pylab.plot(res_blasr["qLength"], res_blasr["score"], "bo", alpha=0.5)
    pylab.xlabel("Length of contig")
    pylab.ylabel("Score blasr (not normalised)")
    pylab.title(title_plot)
    pylab.show()

    # plot score normalised by lenght
    pylab.plot(res_blasr["qLength"], res_blasr["score_norm"], "bo", alpha=0.5)
    pylab.xlabel("Length of contig")
    pylab.ylabel("Score blasr (normalised by length)")
    pylab.title(title_plot)
    pylab.show()

    fig, axarr = pylab.subplots(2, figsize=(15, 8), sharex=True)
    fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10)
    # plot coverage found by blasr, with score
    ax = axarr[0]
    for i in range(res_blasr.shape[0]):
        res_to_plot = res_blasr.iloc[i, :]
        contig = res_to_plot["qName"]
        start = int(res_to_plot["tStart"])
        end = int(res_to_plot["tEnd"])
        score = float(res_to_plot["score_norm"])
        ax.plot([start, end], [score] * 2,
                ls='-',
                lw=5,
                color=colors[i],
                solid_capstyle="butt")
    ax.set_ylabel("Score blasr (normalised by length)")
Beispiel #10
0
################################ IMPORT DATA ##############################################################################################

# presumably read_fof returns the list of BAM paths listed in fof_BAM -- confirm
list_files = read_fof(fof_BAM)
# Table with (at least) the "Filename" and "polymerase" columns used below
description = pd.read_csv(file_description)

# Parallel lists: list_BAM[i] is labelled by labels[i]
list_BAM = []
labels = []
for f in list_files:
    # Last "/"-separated component of the path
    short_name_bam = f.split("/")[-1]
    # "polymerase" value of the first description row whose "Filename"
    # equals that component (IndexError if no row matches)
    labels.append(description[description["Filename"] == short_name_bam]
                  ["polymerase"].values[0])
    list_BAM.append(pacbio.BAMPacbio(f))

# plot read length
fig, ax = pylab.subplots(1, 1, figsize=figsize_read_len)
for i in range(len(list_BAM)):
    bam = list_BAM[i]
    # presumably hold=True overlays each histogram on the current axes -- confirm
    bam.hist_len(hold=True, grid=False, label=labels[i], title="")
ax.legend()
fig.tight_layout()
if save_plots:
    # NOTE: str.replace substitutes every "." in filename_output, so a name
    # with several dots gets "_read_len" inserted before each of them.
    pylab.savefig(filename_output.replace(".", "_read_len."), dpi=182)
    pylab.clf()
else:
    pylab.show()

# plot GC %
fig, ax = pylab.subplots(1, 1, figsize=figsize_GC)
for i in range(len(list_BAM)):
    bam = list_BAM[i]
Beispiel #11
0
    def plot_all_skews(self, figsize=(10, 12), fontsize=16, alpha=0.5):
        """Draw the skew and nucleotide-content curves in one 9-row figure.

        Panels, top to bottom: GC skew, its cumulative sum, AT skew, its
        cumulative sum, cumulative RY skew, cumulative MK skew, cumulative
        H-bond skew, GC content and AT content. All panels share the x axis.

        :param figsize: size of the figure passed to ``pylab.subplots``.
        :param fontsize: font size of the figure title.
        :param alpha: transparency applied to every curve.
        :raises AttributeError: if no window has been set (``_window`` is None).
        """
        if self._window is None:
            raise AttributeError("Please set a valid window to compute skew")

        def draw(axis, series, style, ylabel, title=None, **label_kwargs):
            # One panel: optional title, the curve, then the y label
            if title is not None:
                axis.set_title(title)
            axis.plot(series, style, alpha=alpha)
            axis.set_ylabel(ylabel, **label_kwargs)

        # 9 rows; the FFT panel (a 10th row) is currently disabled
        fig, axarr = pylab.subplots(9, 1, sharex=True, figsize=figsize)

        title_template = (
            "Window size = %d (%.0f %% of genome )\n"
            "        GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%"
        )
        main_title = title_template % (
            self._window,
            self._window * 100 / self.__len__(),
            self.gc_content() * 100,
            (1 - self.gc_content()) * 100,
            self._ignored_nuc * 100,
        )
        pylab.suptitle(main_title, fontsize=fontsize)

        # GC skew and its cumulative sum
        gc_label = "(G -C) / (G + C)"
        draw(axarr[0], list(self._GC_skew_slide[0]), 'b-', gc_label,
             title="GC skew (blue) - Cumulative sum (red)")
        draw(axarr[1], list(np.cumsum(self._GC_skew_slide[0])), 'r-', gc_label)

        # AT skew and its cumulative sum
        at_label = "(A -T) / (A + T)"
        draw(axarr[2], list(self._AT_skew_slide[0]), 'b-', at_label,
             title="AT skew (blue) - Cumulative sum (red)")
        draw(axarr[3], list(np.cumsum(self._AT_skew_slide[0])), 'r-', at_label,
             rotation=0)

        # Cumulative skews Xn, Yn and Zn
        draw(axarr[4], self._Xn, 'g-', "(A + G) - (C + T)",
             title="Cumulative RY skew (Purine - Pyrimidine)")
        draw(axarr[5], self._Yn, 'g-', "(A + C) - (G + T)",
             title="Cumulative MK skew (Amino - Keto)")
        draw(axarr[6], self._Zn, 'g-', "(A + T) - (G + C)",
             title="Cumulative H-bond skew (Weak H-bond - Strong H-bond)")

        # GC and AT content
        draw(axarr[7], list(self._GC_content_slide[0]), 'k-', "GC",
             title="GC content")
        draw(axarr[8], list(self._AT_content_slide[0]), 'k-', "AT",
             title="AT content")

        # Leave room above the panels for the two-line figure title
        fig.tight_layout()
        fig.subplots_adjust(top=0.88)