Example #1
def get_bam_stats(filename):
    from sequana import BAM
    import pandas as pd
    bam = BAM(filename)
    stats = bam.get_stats()
    df = pd.Series(stats).to_frame().T
    return df
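
A minimal usage sketch for the helper above; the BAM path is a hypothetical placeholder, not taken from the source.

# Hypothetical usage; "test.bam" is a placeholder path.
stats_df = get_bam_stats("test.bam")
print(stats_df)   # one-row DataFrame of BAM statistics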
Example #2
from sequana import BAM


class LAA_Assembly:
    """

    Input is a SAM/BAM file from the mapping of amplicons onto a known reference.
    Based on the positions, we can construct the new reference.

    """
    def __init__(self, filename):
        self.bam = BAM(filename)

    def build_reference(self):
        self.bam.reset()
        # scan BAM file assuming it is small
        aa = [a for a in self.bam]

        # retrieve data of interest
        data = [(a.pos, {
            "name": a.query_name,
            "sequence": a.query_sequence,
            "cigar": a.cigarstring,
            "position": a.pos,
            "qstart": a.qstart,
            "qend": a.qend
        }) for a in aa]

        # sort by starting position
        data.sort(key=lambda x: x[0])

        for i, read in enumerate(data):
            read = read[1]
            if i == 0:
                sequence = read["sequence"]  # the first read initialises the reference
            else:
                pr = data[i - 1][1]  # previous read
                L = len(pr["sequence"])
                end_position_pr = pr['position'] - pr['qstart'] + L

                # overlap between previous read and this one
                overlap = end_position_pr - (read['position'] - read['qstart'])
                print(overlap)
                print(pr['position'], pr['qstart'], L, end_position_pr)
                print(read['position'], read['qstart'])
                sequence = sequence + read["sequence"][overlap + 1:]

        # argmax([sum(a==b for a,b in zip(X[-i:] , Y[:i]))/float(i+1) for i in range(1000)])
        return sequence

    def save_fasta(self, filename, sequence=None):
        if sequence is None:
            sequence = self.build_reference()

        with open(filename, "w") as fout:
            fout.write(">test\n{}".format(sequence))
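
A minimal usage sketch for the class above; both file names are hypothetical placeholders.

# Hypothetical usage; the file names below are placeholders.
assembly = LAA_Assembly("amplicons_vs_reference.bam")
assembly.save_fasta("new_reference.fasta")   # builds the reference, then writes it as FASTA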
Example #3
from sequana import BAM
import pylab


def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False,
               local_th=5, global_th=10):
    """Return alignments whose sequence contains the motif.

    If more than `global_th` sliding windows contain more than `local_th`
    occurrences of the motif, the alignment is a hit and is kept.
    """
    b1 = BAM(bamfile)

    # FIND motif and create pictures
    count = 0
    found = []
    Ss = []
    alns = []
    for a in b1:
        count +=1
        if a.query_sequence is None:
            continue
        seq = a.query_sequence
        X1 = [seq[i:i+window].count(motif) for i in range(len(seq))]
        S = sum([x>local_th for x in X1])
        Ss.append(S)
        alns.append(a)
        if S > global_th:
            found.append(True)
            off = a.query_alignment_start
            pylab.clf()
            pylab.plot(range(off+a.reference_start, off+a.reference_start+len(seq)),X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(a.reference_name, S, a.query_name.replace("/", "_")))
        else:
            found.append(False)

    return alns, found, Ss
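
A minimal usage sketch for the function above; the BAM path is a hypothetical placeholder.

# Hypothetical usage; "mapping.bam" is a placeholder path.
alns, found, Ss = find_motif("mapping.bam", motif="CAGCAG", window=200)
print("{} alignments contain the motif".format(sum(found)))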
Example #4
    def plot_specific_alignment(self,
                                query_name,
                                motif,
                                clf=True,
                                windows=[10, 50, 100, 200, 500, 1000]):

        found = None
        bam = BAM(self.bamfile)
        for aln in bam:
            if aln.query_name == query_name:
                found = aln
        if found:
            # Detection
            seq = found.query_sequence
            if clf: pylab.clf()
            for window in windows:
                X = [seq[i:i + window].count(motif) for i in range(len(seq))]
                pylab.plot(X, label=window)
                score = sum([x > window / 6 for x in X])
                print(window, score / 3.)
            pylab.legend()
            pylab.ylabel("# {} in a given sliding window".format(motif))
            pylab.title(query_name)
        else:
            print("Not found")
Example #5
    def plot_specific_alignment(self, bamfile, query_name, motif, clf=True,
            show_figure=True, authorized_flags=[0, 16],
            windows=[10, 50, 100, 150, 200, 250, 500, 1000], local_threshold=5):

        found = None
        bam = BAM(bamfile)
        for aln in bam:
            if aln.query_name == query_name and aln.flag in authorized_flags:
                found = aln
                break  # we may have several entries; let us pick up the first one

        sizes = []
        if found:
            # Detection
            seq = found.query_sequence
            if clf: pylab.clf()
            for window in windows:
                X = [seq[i:i+window].count(motif) for i in range(len(seq))]
                if show_figure:
                    pylab.plot(X, label=window)
                score = sum([x > local_threshold for x in X])
                sizes.append(score - window)
            if show_figure:
                pylab.legend()
                pylab.ylabel("# {} in a given sliding window".format(motif))
                pylab.title(query_name)
        else:
            print("{} Not found in {} file".format(query_name, bamfile))
        
        return sizes
Example #6
    def __init__(self, filename, reference=None, bamfile=None, mode="canu"):
        """Expects a BAM file obtained by mapping the contigs onto the reference, e.g.::

            minimap2 -x map-pb reference filename -a > temp.sam
            bioconvert sam2bam temp.sam temp.bam

        """
        super(Contigs, self).__init__(filename)
        self.mode = mode
        self._df = None
        if bamfile:
            self.bam = BAM(bamfile)
        else:
            self.bam = None
        self.reference = reference
Example #7
    def plot_alignment(self, bamfile, motif, window=200,
            global_th=10, title=None, legend=True, legend_fontsize=11,
            valid_rnames=[],
            valid_flags=[]):
        """Plot alignments that match the motif."""

        bam = BAM(bamfile)
        print("Found {} hits".format(len(bam)))
        pylab.clf()
        count = 0
        for aln in bam:
            if valid_rnames and aln.rname not in valid_rnames:
                continue
            if valid_flags and aln.flag not in valid_flags:
                continue

            seq = aln.query_sequence
            if seq:
                count += 1
                X1 = [seq[i:i+window].count(motif) for i in range(len(seq))]
                pylab.plot(range(aln.reference_start,
                    aln.reference_start+len(seq)),X1, label=aln.query_name)
        print("Showing {} entries after filtering".format(count))
        max_theo = int(1.2*window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count<15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)
Example #8
    def _computation(self):
        self.bam = BAM(self.bam_input)

        results = {}
        results['alignment_count'] = len(self.bam)

        # first, we store the flags
        df = self.bam.get_flags_as_df().sum()
        df = df.to_frame()
        df.columns = ['counter']
        sf = SAMFlags()
        df['meaning'] = sf.get_meaning()
        df = df[['meaning', 'counter']]
        results['flags'] = df

        return results
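        # NOTE: the plotting calls below are never executed (dead code after the return)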

        self.bam.plot_bar_flags(logy=False,
                                filename=self.directory + os.sep +
                                "bar_flags.png")
        self.bam.plot_bar_mapq(filename=self.directory + os.sep +
                               "bar_mapq.png")
Example #9
def bam_get_paired_distance(filename):
    """Return the distance between two mated reads

    :return: list of tuples where each tuple contains the start position and
        end position of the mapped read pair, plus the mode.
        mode = 1 means the fragment is reversed; mode = 2 means the mate is
        reversed; mode = 3 means neither is reversed.

    ::

        distances = bam_get_paired_distance(bamfile)
        hist([x[1]-x[0] for x in distances])

    .. warning:: experimental
    """

    b = BAM(filename)
    distances = []

    for fragment in b:
        if fragment.is_unmapped is False and fragment.mate_is_unmapped is False \
            and fragment.is_read1:

            # get the mate:
            mate = next(b)


            if fragment.is_reverse:
                position2 = fragment.reference_end
                position1 = mate.reference_start
                mode = 1
            elif mate.is_reverse:
                position1 = fragment.reference_start
                position2 = mate.reference_end
                mode = 2
            else: # neither read is reversed; what does that mean?
                # On Hm2, this is the case for 4 pairs out of 1622.
                # This seems to be a special case where the fragment ends exactly
                # at the end of the reference and the mate starts exactly at
                # the beginning, with a length less than 100.
                print(fragment.reference_start, fragment.reference_end)
                print(mate.reference_start, mate.reference_end)
                position1 = -1
                position2 = -1
                mode = 3

            distances.append((position1, position2, mode))

    return distances
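
A minimal usage sketch expanding on the docstring above; the path is a hypothetical placeholder. Pairs reported with mode 3 carry positions of -1, so they are filtered out before computing fragment sizes.

# Hypothetical usage; "paired.bam" is a placeholder path.
distances = bam_get_paired_distance("paired.bam")
fragment_sizes = [end - start for start, end, mode in distances if mode != 3]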
Example #10
def sniff(filename):
    logger.info("Sniffing file {}".format(filename))
    from sequana import BAM, SAM, CRAM
    from sequana.sniffer import sniffer
    datatype = sniffer(filename)
    if datatype == "SAM":
        logger.info("Input data in SAM format")
        data = SAM(filename)
    elif datatype == "BAM":
        logger.info("Input data in BAM format")
        data = BAM(filename)
    elif datatype == "CRAM":
        logger.info("Input data in CRAM format")
        data = CRAM(filename)
    else:
        raise ValueError(
            "Your input file does not seem to be a valid SAM/BAM/CRAM file")
    return data
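
A minimal usage sketch; the filename is a hypothetical placeholder. Depending on the sniffed format, the returned object is a SAM, BAM or CRAM instance.

# Hypothetical usage; "sample.bam" is a placeholder path.
data = sniff("sample.bam")
print(type(data))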
Example #11
    def find_motif(self, motif, window=200, figure=False, savefig=False):

        b1 = BAM(self.bamfile)

        df = {
            "query_name": [],
            "hit": [],
            "length": [],
            "start": [],
            "end": []
        }

        for a in b1:
            if a.query_sequence is None:
                continue
            seq = a.query_sequence

            X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
            S = sum([x >= self.local_threshold for x in X1])

            df['query_name'].append(a.query_name)
            df['start'].append(a.reference_start)
            df['end'].append(a.reference_end)
            df['length'].append(a.rlen)
            df['hit'].append(S)

            if S >= self.global_threshold:
                off = a.query_alignment_start
                #pylab.clf()
                if figure:
                    pylab.plot(
                        range(off + a.reference_start,
                              off + a.reference_start + len(seq)), X1)
                    if savefig:
                        pylab.savefig("{}_{}_{}.png".format(
                            a.reference_name, S,
                            a.query_name.replace("/", "_")))

        df = pd.DataFrame(df)
        L = len(df.query("hit>5"))
        print(L)
        return df
Example #12
    def _computation(self):
        self.bam = BAM(self.bam_input)

        results = {}
        results['alignment_count'] = len(self.bam)

        # first, we store the flags
        df = self.bam.get_flags_as_df().sum()
        df = df.to_frame()
        df.columns = ['counter']
        sf = SAMFlags()
        df['meaning'] = sf.get_meaning()
        df = df[['meaning', 'counter']]
        results['flags'] = df

        return results
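        # NOTE: the plotting calls below are never executed (dead code after the return)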

        self.bam.plot_bar_flags(logy=False, filename=self.directory + os.sep +
                                                     "bar_flags.png")
        self.bam.plot_bar_mapq(filename=self.directory + os.sep + "bar_mapq.png")
Example #13
    def plot_alignment(self,
                       motif,
                       window=200,
                       global_th=10,
                       title=None,
                       legend=True,
                       legend_fontsize=11):
        """Plot alignments that match the motif."""
        df = self._get_aligments(motif=motif,
                                 window=window,
                                 global_th=global_th)
        print("Found {} hits".format(len(df)))
        bam = BAM(self.bamfile)
        pylab.clf()
        count = 0
        for aln in bam:
            if aln.query_name in df.query_name.values:
                seq = aln.query_sequence
                if seq:
                    count += 1
                    X1 = [
                        seq[i:i + window].count(motif) for i in range(len(seq))
                    ]
                    pylab.plot(range(aln.reference_start,
                                     aln.reference_start + len(seq)),
                               X1,
                               label=aln.query_name)

        max_theo = int(1.2 * window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count < 15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)

        return df
Example #14
    def find_motif_bam(self, filename, motif, window=200, figure=False, savefig=False,
            local_threshold=None, global_threshold=None):
        from sequana import BAM
        b1 = BAM(filename)
        df = {
            "query_name": [],
            "hit": [],
            "length": [],
            "start": [],
            "end": []
        }

        for a in b1:
            if a.query_sequence is None:
                continue
            seq = a.query_sequence

            X1, S = self.find_motif_from_sequence(seq, motif, window=window,
                local_threshold=local_threshold)

            df['query_name'].append(a.query_name)
            df['start'].append(a.reference_start)
            df['end'].append(a.reference_end)
            df['length'].append(a.rlen)
            df['hit'].append(S)

            if S >= self.global_threshold:
                off = a.query_alignment_start
                #pylab.clf()
                if figure:
                    pylab.plot(range(off+a.reference_start, off+a.reference_start+len(seq)),X1)
                    if savefig:
                        pylab.savefig("{}_{}_{}.png".format(a.reference_name, S, a.query_name.replace("/", "_")))

        df = pd.DataFrame(df)
        L = len(df.query("hit>5"))
        print(L)
        return df
Example #15
class Contigs(ContigsBase):
    def __init__(self, filename, reference=None, bamfile=None, mode="canu"):
        """Expects a BAM file obtained by mapping the contigs onto the reference, e.g.::

            minimap2 -x map-pb reference filename -a > temp.sam
            bioconvert sam2bam temp.sam temp.bam

        """
        super(Contigs, self).__init__(filename)
        self.mode = mode
        self._df = None
        if bamfile:
            self.bam = BAM(bamfile)
        else:
            self.bam = None
        self.reference = reference

    def bar_plot_contigs_length(self):
        # show the lengths of the N contigs compared to the lengths of the reference
        fref = FastA(self.reference)
        Nref = len(fref.sequences)
        N = len(self.fasta)
        pylab.clf()
        pylab.bar(range(0, N, int(pylab.ceil(N / Nref))),
                  sorted(fref.lengths),
                  width=Nref / 1.1,
                  label="Plasmodium chromosomes")
        pylab.bar(range(0, N),
                  sorted(self.fasta.lengths),
                  width=1,
                  label="canu {} contigs".format(N))
        pylab.legend()
        #pylab.savefig("1179_195_contigs.png", dpi=200)

    def hist_plot_contig_length(self, bins=40, fontsize=16):
        """Plot distribution of contig lengths"""
        L = len(self.fasta.sequences)
        pylab.hist(self.fasta.lengths, lw=1, ec="k", bins=bins)
        pylab.grid()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("#", fontsize=fontsize)
        pylab.title("Distribution {} contigs".format(L))

    def get_df(self, window=100):
        print("building GC content")
        data = tools._base_content(self.filename, window, "GC")
        names = self.fasta.names
        lengths = self.fasta.lengths
        GC = [np.nanmean(data[name]) for name in names]
        nreads = [0] * len(GC)
        covStats = [0] * len(GC)
        if self.mode == "canu":
            for i, comment in enumerate(self.fasta.comments):
                read = [x for x in comment.split() if x.startswith("reads")][0]
                covStat = [
                    x for x in comment.split() if x.startswith("covStat")
                ][0]
                read = read.split("=")[1]
                covStat = covStat.split("=")[1]
                nreads[i] = int(read)
                covStats[i] = float(covStat)
        #if self.bamfile
        df = pd.DataFrame({
            "GC": list(GC),
            "length": lengths,
            "name": names,
            "nread": nreads,
            "covStat": covStats
        })

        # deal with the bamfile
        if self.bam:
            bam_df = self.bam.get_df()
            bam_df = bam_df.query("flag in [0,16]")
            bam_df.set_index("qname", inplace=True)
            chrom_name = bam_df.loc[self.fasta.names]["rname"]
            df["chromosome"] = list(chrom_name)

        self._df = df.copy()
        return df

    def plot_contig_length_vs_nreads(self, fontsize=16):
        # same as plot_scatter_contig_length_nread_cov
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()
        pylab.loglog(df.length, df.nread, "o")
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig N reads", fontsize=fontsize)
        pylab.grid()

        X = df.query("nread>10 and length>100000")['length']
        Y = df.query("nread>10 and length>100000")['nread']
        A = np.vstack([X, np.ones(len(X))]).T
        m, c = np.linalg.lstsq(A, Y.values, rcond=None)[0]
        x = np.array([m1, M1])
        pylab.plot(x, m * x + c, "o-r")
        pylab.tight_layout()

    def plot_scatter_contig_length_nread_cov(self,
                                             fontsize=16,
                                             vmin=0,
                                             vmax=50,
                                             min_nreads=20,
                                             min_length=50000):

        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()

        # least square
        X = df.query("nread>@min_nreads and length>@min_length")['length']
        Y = df.query("nread>@min_nreads and length>@min_length")['nread']
        Z = df.query("nread>@min_nreads and length>@min_length")['covStat']
        print(X)
        print(Y)
        print(Z)

        A = np.vstack([X, np.ones(len(X))]).T
        m, c = np.linalg.lstsq(A, Y.values, rcond=None)[0]
        x = np.array([m1, M1])

        X = df['length']
        Y = df['nread']
        Z = df['covStat']
        pylab.scatter(X, Y, c=Z, vmin=vmin, vmax=vmax)
        pylab.colorbar()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig reads", fontsize=fontsize)
        pylab.title("coverage function of contig length and reads used")
        pylab.grid()
        pylab.plot(x, m * x + c, "o-r")
        pylab.loglog()
        pylab.tight_layout()

    def get_contig_per_chromosome(self):
        if self.bam is None:
            print("no bam file found")
            return
        df = self.bam.get_df()
        df = df.query("flag in [0,16]")
        alldata = {}
        for chrom in sorted(df.rname.unique()):
            data = df.query("rname == @chrom").sort_values(by='rstart')[[
                "qname", "qlen", "rstart", "rend"
            ]]
            alldata[chrom] = data
        return alldata

    def stats(self):
        from sequana.stats import N50, L50
        length = self.get_df()['length']
        return {
            'N50': N50(length),
            'total_length': sum(length),
            'L50': L50(length)
        }

    def plot_contig_length_vs_GC(self):
        pylab.plot(self.get_df()["length"], self.get_df()['GC'], "o")
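
A minimal usage sketch for the class above; the three file names are hypothetical placeholders, and the BAM file is assumed to come from mapping the contigs onto the reference as described in the constructor docstring.

# Hypothetical usage; all three file names are placeholders.
contigs = Contigs("contigs.fasta", reference="reference.fasta", bamfile="temp.bam")
print(contigs.stats())               # N50, L50 and total assembly length
contigs.hist_plot_contig_length()    # distribution of contig lengths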
Example #16
class BAMQCModule(SequanaBaseModule):
    """Report dedicated to BAM file

    ::

        from sequana import sequana_data
        from sequana.modules_report.bamqc import BAMQCModule
        filename = sequana_data("test.bam")

        r = BAMQCModule(filename)
        r.create_html("test.html")

        # report/bam.html is now available

    .. todo:: right now, the computation is performed in the class. Ideally,
        we would like the computation to happen elsewhere, where a json is stored. 
        The json would be the input to this class.
    """
    def __init__(self, bam_input, output_filename=None):
        super().__init__()

        self.bam_input = bam_input
        self.title = "Bam Report"
        self.create_report_content()
        self.create_html(output_filename)

    def create_report_content(self):
        self.sections = list()
        self.add_flag_section()
        self.add_images_section()

    def _computation(self):
        self.bam = BAM(self.bam_input)

        results = {}
        results['alignment_count'] = len(self.bam)

        # first, we store the flags
        df = self.bam.get_flags_as_df().sum()
        df = df.to_frame()
        df.columns = ['counter']
        sf = SAMFlags()
        df['meaning'] = sf.get_meaning()
        df = df[['meaning', 'counter']]
        results['flags'] = df

        return results
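        # NOTE: the plotting calls below are never executed (dead code after the return)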

        self.bam.plot_bar_flags(logy=False,
                                filename=self.directory + os.sep +
                                "bar_flags.png")
        self.bam.plot_bar_mapq(filename=self.directory + os.sep +
                               "bar_mapq.png")

    def add_flag_section(self):
        data = self._computation()
        df = data['flags']

        datatable = DataTable(df, "flags", index=True)
        datatable.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 15,
            'scrollCollapse': 'true',
            'dom': 'tB',
            "paging": "false",
            'buttons': ['copy', 'csv']
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')

        html = ""
        html += "{} {}".format(html_tab, js)

        self.sections.append({
            "name": "Flags information",
            "anchor": "flags",
            "content": html
        })

    def add_images_section(self):
        style = "width:65%"
        import pylab
        pylab.ioff()

        def plotter1(filename):
            self.bam.plot_bar_flags(logy=True, filename=filename)

        html1 = self.create_embedded_png(plotter1, "filename", style=style)

        def plotter2(filename):
            self.bam.plot_bar_flags(logy=False, filename=filename)

        html2 = self.create_embedded_png(plotter2, "filename", style=style)

        def plotter3(filename):
            self.bam.plot_bar_mapq(filename=filename)

        html3 = self.create_embedded_png(plotter3, "filename", style=style)

        self.sections.append({
            "name": "Image",
            "anchor": "table",
            "content": html1 + html2 + html3
        })
Example #17
"""
BAM module example
====================

Plot histogram of MAPQ values contained in a BAM file
"""
#################################################
# first import the relevant modules
from sequana import BAM, sequana_data


#####################################################
# Get a data set (BAM file) for testing
from sequana import BAM, sequana_data
datatest = sequana_data('test.bam', "testing")

####################################################
# Use :class:`sequana.bamtools.BAM` class to plot the MAPQ histogram
b = BAM(datatest)
b.plot_bar_mapq()
Example #18
"""
BAM module example
====================

Plot histogram of MAPQ values contained in a BAM file

"""

#################################################
# first import the relevant modules
from sequana import BAM, sequana_data

#####################################################
# Get a data set (BAM file) for testing
from sequana import BAM, sequana_data
datatest = sequana_data('test.bam', "testing")

##########################################################################
# Use :class:`sequana.bamtools.BAM` class to plot the MAPQ histogram
b = BAM(datatest)
b.plot_bar_mapq()
Example #19
class BAMQCModule(SequanaBaseModule):
    """Report dedicated to BAM file

    ::

        from sequana import sequana_data
        from sequana.modules_report.bamqc import BAMQCModule
        filename = sequana_data("test.bam")

        r = BAMQCModule(filename)
        r.create_html("test.html")

        # report/bam.html is now available

    .. todo:: right now, the computation is performed in the class. Ideally,
        we would like the computation to happen elsewhere, where a json is stored. 
        The json would be the input to this class.
    """
    def __init__(self, bam_input, output_filename=None):
        super().__init__()

        self.bam_input = bam_input
        self.title = "Bam Report"
        self.create_report_content()
        self.create_html(output_filename)

    def create_report_content(self):
        self.sections = list()
        self.add_flag_section()
        self.add_images_section()

    def _computation(self):
        self.bam = BAM(self.bam_input)

        results = {}
        results['alignment_count'] = len(self.bam)

        # first, we store the flags
        df = self.bam.get_flags_as_df().sum()
        df = df.to_frame()
        df.columns = ['counter']
        sf = SAMFlags()
        df['meaning'] = sf.get_meaning()
        df = df[['meaning', 'counter']]
        results['flags'] = df

        return results
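        # NOTE: the plotting calls below are never executed (dead code after the return)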

        self.bam.plot_bar_flags(logy=False, filename=self.directory + os.sep +
                                                     "bar_flags.png")
        self.bam.plot_bar_mapq(filename=self.directory + os.sep + "bar_mapq.png")

    def add_flag_section(self):
        data = self._computation()
        df = data['flags']

        datatable = DataTable(df, "flags", index=True)
        datatable.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 15,
            'scrollCollapse': 'true',
            'dom': 'tB',
            "paging": "false",
            'buttons': ['copy', 'csv']}
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')

        html = ""
        html += "{} {}".format(html_tab, js)

        self.sections.append({
          "name": "Flags information",
          "anchor": "flags",
          "content": html
        })

    def add_images_section(self):
        style = "width:65%"
        import pylab
        pylab.ioff()

        def plotter1(filename):
            self.bam.plot_bar_flags(logy=True, filename=filename)
        html1 = self.create_embedded_png(plotter1, "filename", style=style)

        def plotter2(filename):
            self.bam.plot_bar_flags(logy=False, filename=filename)
        html2 = self.create_embedded_png(plotter2, "filename", style=style)

        def plotter3(filename):
            self.bam.plot_bar_mapq(filename=filename)
        html3 = self.create_embedded_png(plotter3, "filename", style=style)


        self.sections.append({
          "name": "Image",
          "anchor": "table",
          "content": html1 + html2 + html3
        })
Example #20
def summary(**kwargs):
    """Create an HTML report for various types of NGS formats.

    \b
    * bamqc
    * fastq

    This will process all files in the given pattern (in back quotes)
    sequentially and produce one HTML file per input file.


    Other modules all work in the same way. For example, for FastQ files::

        sequana summary one_input.fastq
        sequana summary `ls *fastq`


    """
    names = kwargs['name']
    module = kwargs['module']

    if module is None:
        if names[0].endswith('fastq.gz') or names[0].endswith('.fastq'):
            module = "fastq"
        elif names[0].endswith('.bam'):
            module = "bam"
        elif names[0].endswith('.gff') or names[0].endswith('gff3'):
            module = "gff"
        elif names[0].endswith('fasta.gz') or names[0].endswith('.fasta'):
            module = "fasta"
        else:
            logger.error(
                "please use --module to tell us about the input files")
            sys.exit(1)

    if module == "bamqc":
        for name in names:
            print(f"Processing {name}")
            from sequana.modules_report.bamqc import BAMQCModule
            report = BAMQCModule(name, "bamqc.html")
    elif module == "fasta":  # there is no module per se; here we just call FastA.summary()
        from sequana.fasta import FastA
        for name in names:
            f = FastA(name)
            f.summary()
    elif module == "fastq":  # there is no module per se; here we just print FastQC statistics
        from sequana.fastq import FastQ
        from sequana import FastQC
        for filename in names:
            ff = FastQC(filename, max_sample=1e6, verbose=False)
            stats = ff.get_stats()
            print(stats)
    elif module == "bam":
        import pandas as pd
        from sequana import BAM
        for filename in names:
            ff = BAM(filename)
            stats = ff.get_stats()
            df = pd.Series(stats).to_frame().T
            print(df)
    elif module == "gff":
        import pandas as pd
        from sequana import GFF3
        for filename in names:
            ff = GFF3(filename)
            print("#filename: {}".format(filename))
            print("#Number of entries per genetic type:")
            print(ff.df.value_counts('type').to_string())
            print("#Number of duplicated attribute (if any) per attribute:")
            ff.get_duplicated_attributes_per_type()
Example #21
def bam_to_mapped_unmapped_fastq(filename,
                                 output_directory=None,
                                 verbose=True):
    """Create mapped and unmapped fastq files from a BAM file

    :context: given a reference, one or two FastQ files are mapped onto the
        reference to generate a BAM file. This BAM file is a compressed version
        of a SAM file, whose interpretation this function aims to ease.

    :param filename: input BAM file
    :param output_directory: where to save the mapped and unmapped files
    :return: dictionary with number of reads for each file (mapped/unmapped for
        R1/R2) as well as the mode (paired or not), the number of unpaired
        reads, and the number of duplicated reads. The unpaired reads should
        be zero (sanity check)

    Given a BAM file, create FASTQ with R1/R2 reads mapped and unmapped.
    In the paired-end case, 4 files are created.

    Note that this function is efficient in that it does not create intermediate
    files, limiting IO in the process. Compared to standard tools such as
    bedtools bamtofastq, it is 1.5 to 2X slower, but it creates both the mapped
    AND the unmapped reads.

    :Details: Secondary alignments (flag 256) are dropped so as to remove any
        ambiguous alignments. The output dictionary stores a "secondary" key to
        keep track of the total number of secondary reads that are dropped. If
        the flag is 256 and the read is unpaired, the key *unpaired* is also
        incremented.

        If the flag is not equal to 256, we first reverse-complement reads that
        are tagged as *reverse* in the BAM file. Then, reads that are not paired or
        not "proper pairs" (neither flag 4 nor flag 8) are ignored.

        If R1 is mapped **or** R2 is mapped then the reads are considered mapped. If
        both R1 and R2 are unmapped, then reads are unmapped.

    .. note:: about chimeric alignments: one is the representative and the other is
        the supplementary. This flag is not used in this function. Note also that
        chimeric alignments have the same QNAME and flags 4 and 8

    .. note:: the contamination reported is based on R1 only.

    .. todo:: comments are missing since they are not stored in the BAM file.


    .. note:: the mapped reads may not be synchronized because we also include
        the chimeric alignments (cf. samtools documentation). However,
        total reads = unmapped reads + R1 mapped + R2 mapped - supplementary
        reads (those with flag 2048).
    """
    bam = BAM(filename)
    # figure out if this is paired or unpaired

    newname, ext = os.path.splitext(filename)

    import collections
    stats = collections.defaultdict(int)
    stats['R1_unmapped'] = 0
    stats['R1_mapped'] = 0

    # figure out where to save the file
    if output_directory is None:
        pass
    else:
        assert isinstance(filename, str)
        from sequana.snaketools import FileFactory
        ff = FileFactory(filename)
        newname = output_directory + os.sep + ff.filenames[0]

    rt1 = "_R1_"
    rt2 = "_R2_"

    R1_mapped = open(newname + "{}.mapped.fastq".format(rt1), "wb")
    R1_unmapped = open(newname + "{}.unmapped.fastq".format(rt1), "wb")
    stats['duplicated'] = 0
    stats['unpaired'] = 0

    unpaired = 0

    # if paired, let open other files
    if bam.is_paired:
        stats['mode'] = "pe"
        stats['R2_unmapped'] = 0
        stats['R2_mapped'] = 0
        R2_mapped = open(newname + "{}.mapped.fastq".format(rt2), "wb")
        R2_unmapped = open(newname + "{}.unmapped.fastq".format(rt2), "wb")
    else:
        stats['mode'] = "se"

    # loop through the BAM (make sure it is rewinded)
    bam.reset()

    if verbose:
        from easydev import Progress
        pb = Progress(len(bam))

    for i, this in enumerate(bam):
        if this.flag & 256:
            # Unmapped reads are in the BAM file but have no valid assigned
            # position (N.B., they may have an assigned position, but it should be ignored).
            # It's typically the case that a number of reads can't be aligned, due to things
            # like sequencing errors, imperfect matches between the DNA sequenced and the
            # reference, random e. coli or other contamination, etc..
            # A secondary alignment occurs when a given read could align reasonably well to
            # more than one place. One of the possible reported alignments is termed "primary"
            # and the others will be marked as "secondary".
            stats['secondary'] += 1
            if this.is_paired is False:
                stats['unpaired'] += 1
        else:

            # quick hack
            if this.is_read1:
                suffix = b"/1"
            else:
                suffix = b"/2"

            # in pysam, seq is a string and qual a bytes....
            if this.is_reverse is True:
                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                revcomp = reverse_complement(this.seq)
                txt += bytes(revcomp, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(this.qual[::-1], 'utf-8') + b"\n"
            else:
                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                txt += bytes(this.seq, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(this.qual, "utf-8") + b"\n"

            # Here, we must be careful as to keep the pairs. So if R1 is mapped
            # but R2 is unmapped (or the inverse), then the pair is mapped
            if this.is_read1:
                if this.is_unmapped and this.mate_is_unmapped:
                    R1_unmapped.write(txt)
                    stats['R1_unmapped'] += 1
                else:
                    R1_mapped.write(txt)
                    stats['R1_mapped'] += 1
            elif this.is_read2:
                if this.is_unmapped and this.mate_is_unmapped:
                    R2_unmapped.write(txt)
                    stats['R2_unmapped'] += 1
                else:
                    R2_mapped.write(txt)
                    stats['R2_mapped'] += 1
            else:
                # This should be a single read
                #assert self.is_paired is False
                stats['unpaired'] += 1
                if this.is_unmapped:
                    R1_unmapped.write(txt)
                    stats['R1_unmapped'] += 1
                else:
                    R1_mapped.write(txt)
                    stats['R1_mapped'] += 1

            if this.is_duplicate:
                stats['duplicated'] += 1

        if verbose:
            pb.animate(i + 1)

    if bam.is_paired:
        R2_mapped.close()
        R2_unmapped.close()

    logger.info("\nNumber of entries in the BAM: %s" % str(i + 1))

    R1_mapped.close()
    R1_unmapped.close()

    _x = stats['R1_mapped']
    _y = stats['R1_unmapped']
    stats["contamination"] = _x / float(_x + _y) * 100

    return stats
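
A minimal usage sketch for the function above; the BAM path and output directory are hypothetical placeholders.

# Hypothetical usage; "sample.bam" is a placeholder path.
stats = bam_to_mapped_unmapped_fastq("sample.bam", output_directory=".")
print(stats["mode"], stats["R1_mapped"], stats["R1_unmapped"])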
Example #22
    def __init__(self, filename):
        self.bam = BAM(filename)
Example #23
def bam_to_mapped_unmapped_fastq(filename, output_directory=None, verbose=True):
    """Create mapped and unmapped fastq files from a BAM file

    :context: given a reference, one or two FastQ files are mapped onto the
        reference to generate a BAM file. This BAM file is a compressed version
        of a SAM file, whose interpretation this function aims to ease.

    :param filename: input BAM file
    :param output_directory: where to save the mapped and unmapped files
    :return: dictionary with number of reads for each file (mapped/unmapped for
        R1/R2) as well as the mode (paired or not), the number of unpaired
        reads, and the number of duplicated reads. The unpaired reads should
        be zero (sanity check)

    Given a BAM file, create FASTQ with R1/R2 reads mapped and unmapped.
    In the paired-end case, 4 files are created.

    Note that this function is efficient in that it does not create intermediate
    files, limiting IO in the process. Compared to standard tools such as
    bedtools bamtofastq, it is 1.5 to 2X slower, but it creates both the mapped
    AND the unmapped reads.

    :Details: Secondary alignments (flag 256) are dropped so as to remove any
        ambiguous alignments. The output dictionary stores a "secondary" key to
        keep track of the total number of secondary reads that are dropped. If
        the flag is 256 and the read is unpaired, the key *unpaired* is also
        incremented.

        If the flag is not equal to 256, we first reverse-complement reads that
        are tagged as *reverse* in the BAM file. Then, reads that are not paired or
        not "proper pairs" (neither flag 4 nor flag 8) are ignored.

        If R1 is mapped **or** R2 is mapped then the reads are considered mapped. If
        both R1 and R2 are unmapped, then reads are unmapped.

    .. note:: about chimeric alignments: one is the representative and the other is
        the supplementary. This flag is not used in this function. Note also that
        chimeric alignments have the same QNAME and flags 4 and 8

    .. note:: the contamination reported is based on R1 only.

    .. todo:: comments are missing since they are not stored in the BAM file.


    .. note:: the mapped reads may not be synchronized because we also include
        the chimeric alignments (cf. samtools documentation). However,
        total reads = unmapped reads + R1 mapped + R2 mapped - supplementary
        reads (those with flag 2048).
    """
    bam = BAM(filename)
    # figure out if this is paired or unpaired

    newname, ext = os.path.splitext(filename)

    import collections
    stats = collections.defaultdict(int)
    stats['R1_unmapped'] = 0
    stats['R1_mapped'] = 0

    # figure out where to save the file
    if output_directory is None:
        pass
    else:
        assert isinstance(filename, str)
        from sequana.snaketools import FileFactory
        ff = FileFactory(filename)
        newname = output_directory + os.sep + ff.filenames[0]

    rt1 = "_R1_"
    rt2 = "_R2_"

    R1_mapped = open(newname + "{}.mapped.fastq".format(rt1), "wb")
    R1_unmapped = open(newname + "{}.unmapped.fastq".format(rt1), "wb")
    stats['duplicated'] = 0
    stats['unpaired'] = 0

    unpaired = 0

    # if paired, let open other files
    if bam.is_paired:
        stats['mode'] = "pe"
        stats['R2_unmapped'] = 0
        stats['R2_mapped'] = 0
        R2_mapped = open(newname + "{}.mapped.fastq".format(rt2), "wb")
        R2_unmapped = open(newname + "{}.unmapped.fastq".format(rt2), "wb")
    else:
        stats['mode'] = "se"

    # loop through the BAM (make sure it is rewinded)
    bam.reset()

    if verbose:
        from easydev import Progress
        pb = Progress(len(bam))

    for i, this in enumerate(bam):
        if this.flag & 256:
            # Unmapped reads are in the BAM file but have no valid assigned
            # position (N.B., they may have an assigned position, but it should be ignored).
            # It's typically the case that a number of reads can't be aligned, due to things
            # like sequencing errors, imperfect matches between the DNA sequenced and the
            # reference, random e. coli or other contamination, etc..
            # A secondary alignment occurs when a given read could align reasonably well to
            # more than one place. One of the possible reported alignments is termed "primary"
            # and the others will be marked as "secondary".
            stats['secondary'] += 1
            if this.is_paired is False:
                stats['unpaired'] += 1
        else:

            # quick hack
            if this.is_read1:
                suffix = b"/1"
            else:
                suffix = b"/2"

            # in pysam, seq is a string and qual a bytes....
            if this.is_reverse is True:
                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                revcomp = reverse_complement(this.seq)
                txt += bytes(revcomp, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(this.qual[::-1], 'utf-8') + b"\n"
            else:
                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                txt += bytes(this.seq, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(this.qual,"utf-8") + b"\n"

            # Here, we must be careful as to keep the pairs. So if R1 is mapped
            # but R2 is unmapped (or the inverse), then the pair is mapped
            if this.is_read1:
                if this.is_unmapped and this.mate_is_unmapped:
                    R1_unmapped.write(txt)
                    stats['R1_unmapped'] += 1
                else:
                    R1_mapped.write(txt)
                    stats['R1_mapped'] += 1
            elif this.is_read2:
                if this.is_unmapped and this.mate_is_unmapped:
                    R2_unmapped.write(txt)
                    stats['R2_unmapped'] += 1
                else:
                    R2_mapped.write(txt)
                    stats['R2_mapped'] += 1
            else:
                # This should be a single read
                #assert self.is_paired is False
                stats['unpaired'] += 1
                if this.is_unmapped:
                    R1_unmapped.write(txt)
                    stats['R1_unmapped'] += 1
                else:
                    R1_mapped.write(txt)
                    stats['R1_mapped'] += 1

            if this.is_duplicate:
                stats['duplicated'] += 1

        if verbose:
            pb.animate(i+1)

    if bam.is_paired:
        R2_mapped.close()
        R2_unmapped.close()

    if verbose:
        print("\nNumber of entries in the BAM: %s" % str(i+1))

    R1_mapped.close()
    R1_unmapped.close()

    _x = stats['R1_mapped']
    _y = stats['R1_unmapped']
    stats["contamination"] = _x / float(_x + _y) * 100

    return stats