Esempio n. 1
0
def test_pacbio_random():
    """Run ``random_selection`` on the example BAM with both argument sets."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # First by an explicit number of reads, then by expected coverage
    # together with a reference length.
    selections = (
        {"nreads": 10},
        {"expected_coverage": 10, "reference_length": 10000},
    )
    for options in selections:
        with TempFile() as output:
            subreads.random_selection(output.name, **options)
Esempio n. 2
0
def test_pacbio_random():
    """Run ``random_selection`` on the example BAM with both argument sets."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # First by an explicit number of reads, then by expected coverage
    # together with a reference length.
    selections = (
        {"nreads": 10},
        {"expected_coverage": 10, "reference_length": 10000},
    )
    for options in selections:
        with TempFile() as output:
            subreads.random_selection(output.name, **options)
Esempio n. 3
0
 def __init__(self, filename):
     """Store the input path and wrap it in a :class:`PacbioSubreads`.

     :param filename: path passed to :class:`PacbioSubreads`.
     """
     # No dataframe has been computed yet.
     self._df = None
     self.filename = filename
     self.bam = PacbioSubreads(filename)
Esempio n. 4
0
class PacbioIsoSeqMappedIsoforms(object):
    """Load a SAM/BAM file created by mapping HQ isoforms on a reference
    (e.g. with minimap2).

    :attr:`df` contains one row for each read found in the SAM (and
    hq_isoform). We populate the GC content, the mapping flag and the
    reference name (-1 means no mapping, i.e. flag == 4). A flag of 4 means
    unmapped and there is no ambiguity about it.

    In the data file example, other flags are 0, 16 (SEQ being reverse
    complement) and 2048 (supplementary segment).

    Example of minimap2 command::

        minimap2 -t 4  -ax splice -uf --secondary=no  SIRV-E0.fa
            hq_isoforms.fasta 1> hq_isoforms.fasta.sam 2> hq_isoforms.fasta.sam.log

    Reads a SAM file for now. BAM should work as well

    """
    def __init__(self, filename):
        """
        :param filename: SAM/BAM file, passed to :class:`PacbioSubreads`.
        """
        self.filename = filename
        self.bam = PacbioSubreads(self.filename)
        # Cache for the :attr:`df` property; None until first access.
        self._df = None

    @property
    def df(self):
        """Dataframe with one row per alignment, built once and then cached.

        It is a copy of ``self.bam.df`` extended with the columns
        ``reference_name``, ``flags``, ``mapq``, ``cigar`` and ``qname``.
        The ``snr_*`` columns are dropped. ``full_length`` and
        ``non_full_length`` are added when the query names can be parsed.
        """
        if self._df is not None:
            return self._df

        # !! for isoseq, we should be able to load everything into memory
        self.bam.reset()
        data = list(self.bam.data)

        df = self.bam.df.copy()

        # -1 is kept as the marker for alignments without a reference.
        df['reference_name'] = [
            self.bam.data.get_reference_name(a.rname) if a.rname != -1 else -1
            for a in data
        ]
        df['flags'] = [a.flag for a in data]
        df['mapq'] = [a.mapq for a in data]
        df['cigar'] = [a.cigarstring for a in data]
        df['qname'] = [a.qname for a in data]

        # Drop SNR that are not populated in the mapped BAM file.
        df.drop(['snr_A', 'snr_C', 'snr_G', 'snr_T'], axis=1, inplace=True)

        # TODO. input could be based on mapping of CCS in which case, the ZMW
        # is stored and the following does not work. could check whether the
        # pattern is fXXpXX
        #
        # The second '/'-separated field of the query name is parsed as
        # f<full_length>p<non_full_length>. This stays best effort: names
        # that do not follow the pattern simply leave the column(s) out, but
        # only parsing errors are silenced (not e.g. KeyboardInterrupt).
        try:
            df["full_length"] = df["qname"].apply(
                lambda x: int(x.split('/')[1].split("p")[0].strip("f")))
            df["non_full_length"] = df["qname"].apply(
                lambda x: int(x.split("/")[1].split("p")[1].strip("f")))
        except (AttributeError, IndexError, ValueError):
            pass

        self._df = df

        return self._df

    def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
        """Overlay the length histograms of mapped and unmapped isoforms.

        :param bins: bin edges passed to ``pylab.hist``. If None, bins of
            width 100 are used, starting at 0 and covering the largest
            ``reference_length``.
        """
        df = self.df
        if bins is None:
            # max() returns a scalar, so it is used directly as upper bound.
            # One extra bin is added so that the longest isoform is included
            # and at least two edges are always defined.
            longest = int(df.reference_length.max())
            bins = list(range(0, longest + 101, 100))
        mapped = df[df.reference_name != -1]
        unmapped = df[df.reference_name == -1]
        pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
            label="mapped {}".format(len(mapped)), density=False)
        # NOTE(review): same column as for the mapped reads; confirm that
        # reference_length is the intended length for unmapped reads.
        pylab.hist(unmapped.reference_length, bins=bins, alpha=0.5,
            label="unmapped {}".format(len(unmapped)), density=False)
        pylab.xlabel("Isoform length")
        pylab.legend()

    def hist_transcript(self, hide_unmapped=True):
        """Bar plot of the number of reads per reference name.

        :param hide_unmapped: if True, rows whose reference name is -1 are
            excluded.
        :return: the counts per reference name (possibly empty, in which
            case nothing is plotted).
        """
        pylab.clf()

        if hide_unmapped is True:
            query = "reference_length>0 and reference_name!=-1"
        else:
            query = "reference_length>0"

        print(query)
        ts = self.df.query(query).groupby("reference_name").count().reference_length
        if len(ts) == 0:
            print("nothing to plot")
            return ts

        ts.plot(kind="bar", color="r")
        # The layout adjustment is cosmetic: a failure must not lose the plot.
        try:
            pylab.tight_layout()
        except Exception:
            pass
        return ts

    def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
        """Histogram of the mapping qualities.

        :param logy: if True, use a logarithmic Y axis.
        :param xmin: lower limit of the X axis.
        :param xmax: upper limit of the X axis.
        :param fontsize: font size of the X label.
        """
        self.df.mapq.hist()
        if logy:
            pylab.semilogy()
        pylab.xlim([xmin, xmax])
        pylab.xlabel("Mapping quality", fontsize=fontsize)

    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count the mapped reads per group of reference names.

        The group of a read is made of the first *shift* characters of its
        reference name.

        :param title: title of the plot (used only if *plot* is True).
        :param shift: number of leading characters defining the group.
        :param plot: if True, show the counts as a bar plot.
        :param mapq_min: keep only reads whose mapq is strictly larger.
        :return: tuple made of the counts per group and :attr:`df`.
        """
        aa = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(aa) == 0:
            return pd.Series(), self.df

        aa['group'] = aa.reference_name.apply(lambda x: x[0:shift])
        # mapq_min is looked up by name from within the query string.
        mapped = aa.query("mapq>@mapq_min").groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
Esempio n. 5
0
def test_pacbio_stride():
    """Run ``stride`` on the example BAM, without and with ``random=True``."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    for extra in ({}, {"random": True}):
        with TempFile() as output:
            subreads.stride(output.name, stride=2, **extra)
Esempio n. 6
0
def test_pacbio():
    """Smoke test of the main :class:`PacbioSubreads` methods."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    assert len(subreads) == 130
    subreads.df

    with TempFile() as output:
        subreads.filter_length(output.name, threshold_min=500)

    # Printing exercises the string representation.
    print(subreads)

    assert 62.46 < subreads.stats['mean_GC']
    assert 65.47 > subreads.stats['mean_GC']

    subreads.summary()

    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # Each group of calls starts from scratch: the cached dataframe is
    # cleared before the first method of the group is called.
    from_scratch = (
        ("hist_snr",),
        ("hist_read_length", "hist_nb_passes", "get_mean_nb_passes"),
        ("hist_GC",),
        ("plot_GC_read_len",),
    )
    for method_names in from_scratch:
        subreads._df = None
        for method_name in method_names:
            getattr(subreads, method_name)()

    # The exports below start from scratch as well.
    subreads._df = None

    with TempFile() as output:
        subreads.to_fasta(output.name, threads=1)
    with TempFile() as output:
        subreads.to_fastq(output.name, threads=1)
    with TempFile() as output:
        subreads.save_summary(output.name)
Esempio n. 7
0
 def __init__(self, filename):
     """Store the input path and wrap it in a :class:`PacbioSubreads`.

     :param filename: path passed to :class:`PacbioSubreads`.
     """
     # No dataframe has been computed yet.
     self._df = None
     self.filename = filename
     self.bam = PacbioSubreads(filename)
Esempio n. 8
0
class PacbioIsoSeqMappedIsoforms(object):
    """Load a SAM/BAM file created by mapping HQ isoforms on a reference
    (e.g. with minimap2).

    :attr:`df` contains one row for each read found in the SAM (and
    hq_isoform). We populate the GC content, the mapping flag and the
    reference name (-1 means no mapping, i.e. flag == 4). A flag of 4 means
    unmapped and there is no ambiguity about it.

    In the data file example, other flags are 0, 16 (SEQ being reverse
    complement) and 2048 (supplementary segment).

    Example of minimap2 command::

        minimap2 -t 4  -ax splice -uf --secondary=no  SIRV-E0.fa
            hq_isoforms.fasta 1> hq_isoforms.fasta.sam 2> hq_isoforms.fasta.sam.log

    Reads a SAM file for now. BAM should work as well

    """
    def __init__(self, filename):
        """
        :param filename: SAM/BAM file, passed to :class:`PacbioSubreads`.
        """
        self.filename = filename
        self.bam = PacbioSubreads(self.filename)
        # Cache for the :attr:`df` property; None until first access.
        self._df = None

    @property
    def df(self):
        """Dataframe with one row per alignment, built once and then cached.

        It is a copy of ``self.bam.df`` extended with the columns
        ``reference_name``, ``flags``, ``mapq``, ``cigar`` and ``qname``.
        The ``snr_*`` columns are dropped. ``full_length`` and
        ``non_full_length`` are added when the query names can be parsed.
        """
        if self._df is not None:
            return self._df

        # !! for isoseq, we should be able to load everything into memory
        self.bam.reset()
        data = list(self.bam.data)

        df = self.bam.df.copy()

        # -1 is kept as the marker for alignments without a reference.
        rnames = [
            self.bam.data.get_reference_name(a.rname) if a.rname != -1 else -1
            for a in data
        ]

        df['reference_name'] = rnames
        df['flags'] = [a.flag for a in data]
        df['mapq'] = [a.mapq for a in data]
        df['cigar'] = [a.cigarstring for a in data]
        df['qname'] = [a.qname for a in data]

        # Drop SNR that are not populated in the mapped BAM file.
        df.drop(['snr_A', 'snr_C', 'snr_G', 'snr_T'], axis=1, inplace=True)

        # TODO. input could be based on mapping of CCS in which case, the ZMW
        # is stored and the following does not work. could check whether the
        # pattern is fXXpXX
        #
        # The second '/'-separated field of the query name is parsed as
        # f<full_length>p<non_full_length>. This stays best effort: names
        # that do not follow the pattern simply leave the column(s) out, but
        # only parsing errors are silenced (not e.g. KeyboardInterrupt).
        try:
            df["full_length"] = df["qname"].apply(
                lambda x: int(x.split('/')[1].split("p")[0].strip("f")))
            df["non_full_length"] = df["qname"].apply(
                lambda x: int(x.split("/")[1].split("p")[1].strip("f")))
        except (AttributeError, IndexError, ValueError):
            pass

        self._df = df

        return self._df

    def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
        """Overlay the length histograms of mapped and unmapped isoforms.

        :param bins: bin edges passed to ``pylab.hist``. If None, bins of
            width 100 are used, starting at 0 and covering the largest
            ``reference_length``.
        """
        df = self.df
        if bins is None:
            # max() returns a scalar, so it is used directly as upper bound.
            # One extra bin is added so that the longest isoform is included
            # and at least two edges are always defined.
            longest = int(df.reference_length.max())
            bins = list(range(0, longest + 101, 100))
        mapped = df[df.reference_name != -1]
        unmapped = df[df.reference_name == -1]
        pylab.hist(mapped.reference_length,
                   bins=bins,
                   alpha=0.5,
                   label="mapped {}".format(len(mapped)),
                   density=False)
        # NOTE(review): same column as for the mapped reads; confirm that
        # reference_length is the intended length for unmapped reads.
        pylab.hist(unmapped.reference_length,
                   bins=bins,
                   alpha=0.5,
                   label="unmapped {}".format(len(unmapped)),
                   density=False)
        pylab.xlabel("Isoform length")
        pylab.legend()

    def hist_transcript(self, hide_unmapped=True):
        """Bar plot of the number of reads per reference name.

        :param hide_unmapped: if True, rows whose reference name is -1 are
            excluded.
        :return: the counts per reference name (possibly empty, in which
            case nothing is plotted).
        """
        pylab.clf()

        if hide_unmapped is True:
            query = "reference_length>0 and reference_name!=-1"
        else:
            query = "reference_length>0"

        print(query)
        ts = self.df.query(query).groupby(
            "reference_name").count().reference_length
        if len(ts) == 0:
            print("nothing to plot")
            return ts

        ts.plot(kind="bar", color="r")
        # The layout adjustment is cosmetic: a failure must not lose the plot.
        try:
            pylab.tight_layout()
        except Exception:
            pass
        return ts

    def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
        """Histogram of the mapping qualities.

        :param logy: if True, use a logarithmic Y axis.
        :param xmin: lower limit of the X axis.
        :param xmax: upper limit of the X axis.
        :param fontsize: font size of the X label.
        """
        self.df.mapq.hist()
        if logy:
            pylab.semilogy()
        pylab.xlim([xmin, xmax])
        pylab.xlabel("Mapping quality", fontsize=fontsize)

    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count the mapped reads per group of reference names.

        The group of a read is made of the first *shift* characters of its
        reference name.

        :param title: title of the plot (used only if *plot* is True).
        :param shift: number of leading characters defining the group.
        :param plot: if True, show the counts as a bar plot.
        :param mapq_min: keep only reads whose mapq is strictly larger.
        :return: tuple made of the counts per group and :attr:`df`.
        """
        aa = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(aa) == 0:
            return pd.Series(), self.df

        aa['group'] = aa.reference_name.apply(lambda x: x[0:shift])
        # mapq_min is looked up by name from within the query string.
        mapped = aa.query("mapq>@mapq_min").groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
Esempio n. 9
0
"""
read length histograms pacbio data
=====================================

Quality-control example on PacBio subreads: histograms of the read lengths
and of the SNRs.

"""

########################################
# First, let us get a data set example.
# Note the .bam extension
from sequana import sequana_data
dataset = sequana_data("test_pacbio_subreads.bam")

#############################################
# Create a :class:`sequana.pacbio.PacbioSubreads` instance
from sequana.pacbio import PacbioSubreads
qc = PacbioSubreads(dataset)

#########################################
# plot the histogram of read length
qc.hist_read_length()

#################################################
# plot the histogram of the SNRs for each base
qc.hist_snr()
Esempio n. 10
0
def test_pacbio_stride():
    """Run ``stride`` on the example BAM, without and with ``random=True``."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    for extra in ({}, {"random": True}):
        with TempFile() as output:
            subreads.stride(output.name, stride=2, **extra)
Esempio n. 11
0
def test_pacbio():
    """Smoke test of the main :class:`PacbioSubreads` methods."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    assert len(subreads) == 130
    subreads.df

    with TempFile() as output:
        subreads.filter_length(output.name, threshold_min=500)

    # Printing exercises the string representation.
    print(subreads)

    assert 62.46 < subreads.stats['mean_GC']
    assert 65.47 > subreads.stats['mean_GC']

    subreads.summary()

    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # Each group of calls starts from scratch: the cached dataframe is
    # cleared before the first method of the group is called.
    from_scratch = (
        ("hist_snr",),
        ("hist_read_length", "hist_nb_passes", "get_mean_nb_passes"),
        ("hist_GC",),
        ("plot_GC_read_len",),
    )
    for method_names in from_scratch:
        subreads._df = None
        for method_name in method_names:
            getattr(subreads, method_name)()

    # The exports below start from scratch as well.
    subreads._df = None

    with TempFile() as output:
        subreads.to_fasta(output.name, threads=1)
    with TempFile() as output:
        subreads.to_fastq(output.name, threads=1)
    with TempFile() as output:
        subreads.save_summary(output.name)