def test_compressor_dsrc():
    """Round-trip a FastQ file through the dsrc codec (gz and bz2).

    The chain gz -> dsrc -> bz2 -> dsrc -> gz is run on a copy of the test
    file inside a temporary directory. Since the compression is not
    deterministic, the check compares the parsed content of the input and
    output files rather than their raw bytes.
    """
    # Create a temporary directory and chdir in it:
    tempdir = tempfile.TemporaryDirectory()
    filename = sequana_data("test.fastq.gz")
    shutil.copy(filename, tempdir.name)
    cwd = os.path.abspath(os.curdir)
    os.chdir(tempdir.name)

    # We convert gz -> dsrc -> bz2 -> dsrc -> gz and must get the same content
    conversions = [
        ("fastq.gz", "fastq.dsrc"),
        ("fastq.dsrc", "fastq.bz2"),
        ("fastq.bz2", "fastq.dsrc"),
        ("fastq.dsrc", "fastq.gz"),
    ]
    try:
        # Any failure propagates unchanged: re-wrapping it in a generic
        # Exception would hide the original type and obscure the traceback.
        for source, target in conversions:
            compressor.main(
                [prog, "--source", source, "--target", target, "--quiet"])
    finally:
        # Always restore the working directory, even if a conversion fails
        os.chdir(cwd)

    f1 = FastQ(filename)
    f2 = FastQ(os.path.join(tempdir.name, os.path.basename(filename)))
    assert f1 == f2
Esempio n. 2
0
    def is_synchronised(self):
        """Tell whether the two paired FastQ files list reads in the same order.

        The reads of ``self.fq1`` and ``self.fq2`` are walked in lockstep. For
        each pair, the first whitespace-separated token of the identifier is
        kept and a trailing ``/1`` or ``/2`` marker is removed before the two
        identifiers are compared.

        :return: False as soon as two identifiers differ (both are printed),
            True otherwise. The number of compared pairs is printed before
            returning True.

        .. note:: ``zip`` stops at the shorter input, so extra reads at the
            end of the longer file are not inspected.
        """
        from sequana import FastQ

        def strip_mate_suffix(name):
            # Drop the marker only at the very end of the identifier.
            # Splitting on "/1" would also cut at an earlier occurrence,
            # e.g. "M1/10:5/1" would become "M1".
            if name.endswith(("/1", "/2")):
                return name[:-2]
            return name

        N = 0
        for a, b in zip(FastQ(self.fq1), FastQ(self.fq2)):
            a = a['identifier'].decode().split()[0]
            b = b['identifier'].decode().split()[0]

            id1 = strip_mate_suffix(a)
            id2 = strip_mate_suffix(b)

            if id1 != id2:
                print("%s differs from %s" % (id1, id2))
                print(a)
                print(b)
                return False
            N += 1
        print(N)
        return True
Esempio n. 3
0
    def __init__(self, directory=".", prefix=""):
        """Locate the IsoSeq result files and load those that are found.

        Each file is looked up through ``self.get_file``. When a lookup
        returns a false value, the corresponding data attribute
        (``lq_sequence``, ``hq_sequence``, ``data``) is left to None.

        :param directory: directory where the files are searched.
        :param prefix: stored as ``self.prefix``; the CCS FastA file is
            looked up with ``noprefix=True``.
        """
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # Define the optional attributes up front so that they always exist;
        # otherwise a check such as ``self.data is not None`` raises
        # AttributeError when the matching file is missing.
        self.lq_sequence = None
        self.hq_sequence = None
        self.data = None

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence: self.ccs first holds the file name returned by
        # get_file, then is replaced by the FastA reader when a file is found
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)
Esempio n. 4
0
    def __init__(self, directory=".", prefix=""):
        """Locate the IsoSeq result files and load those that are found.

        Each file is looked up through ``self.get_file``. When a lookup
        returns a false value, the corresponding data attribute
        (``lq_sequence``, ``hq_sequence``, ``data``) is left to None.

        :param directory: directory where the files are searched.
        :param prefix: stored as ``self.prefix``; the CCS FastA file is
            looked up with ``noprefix=True``.
        """
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # Define the optional attributes up front so that they always exist;
        # otherwise a check such as ``self.data is not None`` raises
        # AttributeError when the matching file is missing.
        self.lq_sequence = None
        self.hq_sequence = None
        self.data = None

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence: self.ccs first holds the file name returned by
        # get_file, then is replaced by the FastA reader when a file is found
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)
Esempio n. 5
0
def main(args=None):
    """Map FastQ reads onto a reference with ``bwa mem`` and sort the result.

    Steps performed:

    1. print the theoretical depth of coverage, i.e. the total number of
       bases found in the FastQ file(s) divided by the length of the first
       sequence of the reference;
    2. index the reference with ``bwa index`` and ``samtools faidx``;
    3. pipe ``bwa mem`` into ``samtools view`` followed by ``samtools sort``
       (when the ``sambamba`` option is False) or ``sambamba sort``
       (otherwise). The output is ``<reference>.sorted.bam``.

    :param args: command-line arguments, program name first. Defaults to
        ``sys.argv``.
    :raises ValueError: if ``--file1`` is not provided.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
        # NOTE(review): parse_args presumably exits on --help; return anyway
        # so that `options` can never be used while unbound.
        return
    options = user_options.parse_args(args[1:])

    # Validate first so that `fastq` is always bound below
    if not options.file1:
        raise ValueError("--file1 must be used")

    reference = options.reference
    if options.file2:
        fastq = "%s %s" % (options.file1, options.file2)
    else:
        fastq = "%s" % (options.file1)

    from sequana import FastQ
    from sequana import FastA
    S = sum(len(this['sequence']) for this in FastQ(options.file1))
    if options.file2:
        S += sum(len(this['sequence']) for this in FastQ(options.file2))
    ref = FastA(options.reference)
    # Only the first sequence of the reference is used as genome length
    coverage = float(S) / len(ref.sequences[0])
    print('Theoretical Depth of Coverage : %s' % coverage)

    params = {"reference": reference, "fastq": fastq, "thread": options.thread}

    # NOTE(review): file names are interpolated unquoted into shell command
    # lines; paths containing spaces or shell metacharacters will break.

    # indexing
    shellcmd("bwa index %(reference)s " % params)
    # The faidx command used to be built and then overwritten without ever
    # being executed; run it so that the FASTA index is really created.
    shellcmd("samtools faidx %(reference)s " % params)

    # mapping
    cmd = "bwa mem -M "  # mark shorter split read as secondary; -M is not compulsary but recommended
    if options.pacbio:
        cmd += "-x pacbio "
    cmd += r" -t %(thread)s -R @RG\\tID:1\\tSM:1\\tPL:illumina -T 30 %(reference)s %(fastq)s  "

    # Samtools options:
    #   S:ignore input format
    #   h:include header
    #   b:bam output
    if options.sambamba is False:
        cmd += "| samtools view -Sbh | "
        # sorting BAM
        cmd += "samtools sort -@ %(thread)s -o %(reference)s.sorted.bam -"
    else:
        # FIXME use sambamba for the view as well
        # The template is formatted only once, below, together with the rest
        # of the pipeline (it used to be formatted twice).
        cmd += "| samtools view -Sbu - | sambamba sort /dev/stdin -o %(reference)s.sorted.bam -t %(thread)s  --tmpdir=./tmp  "
    shellcmd(cmd % params)
Esempio n. 6
0
def test_fastq_unzipped():
    """Exercise the FastQ reader on each of the ``data`` and ``datagz`` files."""
    for source in (data, datagz):
        # instantiation
        reader = fastq.FastQ(source)
        assert reader.data_format == "Illumina_1.8+"

        # Each counter is called twice to make sure that re-running it
        # (decompression with zlib) works when run again.
        for _ in range(2):
            assert reader.count_lines() == 1000
        for _ in range(2):
            assert reader.count_reads() == 250

        # head of the file extracted into an unzipped file
        plain_head = TempFile()
        reader.extract_head(100, plain_head.name)
        assert fastq.FastQ(plain_head.name).count_lines() == 100
        plain_head.delete()

        # head of the file extracted into a zipped file
        zipped_head = TempFile(suffix=".gz")
        reader.extract_head(100, zipped_head.name)
        assert fastq.FastQ(zipped_head.name).count_lines() == 100
        zipped_head.delete()

        # the reader is also usable as a context manager
        with FastQ(source) as handle:
            assert len(handle) == 250

        # the value returned by a first selection is fed to a second one
        with TempFile() as out:
            picked = reader.select_random_reads(10, out.name)
            reader.select_random_reads(picked, out.name)
Esempio n. 7
0
def test_filter():
    """Check FastQ.filter with a length window keeping all reads, then none."""
    reads = fastq.FastQ(data)

    # 80-120 bp window: all 250 reads are expected in the output
    with TempFile() as out:
        reads.filter(
            min_bp=80,
            max_bp=120,
            output_filename=out.name,
            progressbar=False,
        )
        assert len(reads) == 250
        kept = FastQ(out.name)
        assert len(kept) == 250

    # 80-90 bp window: the output is expected to be empty
    with TempFile() as out:
        reads.filter(min_bp=80, max_bp=90, output_filename=out.name)
        assert len(reads) == 250
        kept = FastQ(out.name)
        assert len(kept) == 0
def test_compressor_running():
    """Round-trip a FastQ file through gz -> bz2 -> gz -> dsrc -> gz.

    Some steps use ``--recursive`` and some do not. The conversions run on a
    copy of the test file inside a temporary directory. Since the
    compression is not deterministic, the check compares the parsed content
    of the input and output files rather than their raw bytes.
    """
    # get a fastq.gz in a temp directory and process it
    tempdir = tempfile.TemporaryDirectory()
    filename = sequana_data("test.fastq.gz")
    shutil.copy(filename, tempdir.name)
    cwd = os.path.abspath(os.curdir)
    os.chdir(tempdir.name)

    steps = [
        ["--source", "fastq.gz", "--target", "fastq.bz2", "--quiet"],
        ["--source", "fastq.bz2", "--target", "fastq.gz",
         "--recursive", "--quiet"],
        ["--source", "fastq.gz", "--target", "fastq.dsrc",
         "--recursive", "--quiet"],
        ["--source", "fastq.dsrc", "--target", "fastq.gz", "--quiet"],
    ]
    try:
        # seems to fail on travis with a subprocess issue
        # https://travis-ci.org/sequana/sequana/builds/162466158
        # Any failure propagates unchanged: re-wrapping it in a generic
        # Exception would hide the original type and obscure the traceback.
        for step in steps:
            compressor.main([prog] + step)
    finally:
        # Always restore the working directory, even if a conversion fails
        os.chdir(cwd)

    f1 = FastQ(filename)
    f2 = FastQ(os.path.join(tempdir.name, os.path.basename(filename)))
    assert f1 == f2
Esempio n. 9
0
    def __init__(self, directory=".", prefix="job-*"):
        """Locate the IsoSeq result files and load those that are found.

        Each file is looked up through ``self.get_file``. When a lookup
        returns a false value, the corresponding data attribute
        (``lq_sequence``, ``hq_sequence``, ``data``) is left to None.

        :param directory: directory where the files are searched.
        :param prefix: stored as ``self.prefix``; the CCS FastA file is
            looked up with ``noprefix=True``.
        """
        self.prefix = prefix
        self.directory = directory

        # Define the optional attributes up front so that they always exist;
        # otherwise a check such as ``self.data is not None`` raises
        # AttributeError when the matching file is missing.
        self.lq_sequence = None
        self.hq_sequence = None
        self.data = None

        # low quality isoforms
        self.lq_isoforms = self.get_file("lq_isoforms.fastq")
        if self.lq_isoforms:
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        self.hq_isoforms = self.get_file("hq_isoforms.fastq")
        if self.hq_isoforms:
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        self.csv = self.get_file("-file.csv")
        if self.csv:
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence: self.ccs first holds the file name returned by
        # get_file, then is replaced by the FastA reader when a file is found
        self.ccs = self.get_file("ccs.fasta", noprefix=True)
        if self.ccs:
            self.ccs = FastA(self.ccs)
Esempio n. 10
0
def test_bam(tmpdir):
    """Smoke-test the BAM reader: counts, statistics, export and plots.

    :param tmpdir: not used by the body (presumably the pytest fixture).
    """
    datatest = sequana_data("test.bam", "testing")
    s = BAM(datatest)
    assert len(s) == 1000
    assert s.is_sorted is True
    df = s.get_df_concordance()
    assert s.is_paired is True
    assert int(df.length.sum()) == 67938
    assert int(df.M.sum()) == 67788

    df = s.get_df()

    # call this here before other computations on purpose
    with TempFile(suffix=".json") as fh:
        s.bam_analysis_to_json(fh.name)

    assert s.get_read_names()
    s.get_mapped_read_length()

    s.get_stats()
    s.get_stats_full()
    s.get_samtools_stats_as_df()

    with TempFile() as fh:
        s.to_fastq(fh.name)
        from sequana import FastQ
        ff = FastQ(fh.name)
        # This comparison used to be a bare expression and was never checked
        assert len(ff) == len(s)

    # plotting
    with TempFile(suffix='.png') as fh:
        s.plot_bar_flags(filename=fh.name, logy=True)
        s.plot_bar_flags(filename=fh.name)

    with TempFile(suffix='.png') as fh:
        s.plot_bar_mapq(filename=fh.name)

    s.get_gc_content()
    s.get_length_count()
    s.plot_gc_content()
    s.boxplot_qualities()
    s.boxplot_qualities(max_sample=50)

    # These bins must be rejected. The failure is raised from the ``else``
    # clause: raised inside the ``try`` body, it would be swallowed by the
    # handler and the check could never fail.
    try:
        s.plot_gc_content(bins=[1, 2, 10])
    except Exception:
        pass
    else:
        raise AssertionError("plot_gc_content accepted bins=[1, 2, 10]")
Esempio n. 11
0
def test_bam(tmpdir):
    """Smoke-test the BAM reader: counts, iterators, statistics and plots.

    :param tmpdir: not used by the body (presumably the pytest fixture).
    """

    s = BAM(datatest)
    assert len(s) == 1000
    assert s.is_sorted is True

    # reset() is called after each pass before the reader is used again
    assert len(list(s.iter_unmapped_reads())) == 2
    s.reset()
    assert len(list(s.iter_mapped_reads())) == 998
    s.reset()

    # call this here before other computations on purpose
    with TempFile(suffix=".json") as fh:
        s.bam_analysis_to_json(fh.name)

    assert s.get_read_names()
    s.get_mapped_read_length()

    s.get_stats()
    s.get_full_stats_as_df()

    with TempFile(suffix='.png') as fh:
        s.plot_bar_flags(filename=fh.name, logy=True)
        s.plot_bar_flags(filename=fh.name)

    with TempFile(suffix='.png') as fh:
        s.plot_bar_mapq(filename=fh.name)

    with TempFile() as fh:
        s.to_fastq(fh.name)
        from sequana import FastQ
        ff = FastQ(fh.name)
        # This comparison used to be a bare expression and was never checked
        assert len(ff) == len(s)

    s.get_gc_content()
    s.get_length_count()
    s.plot_gc_content()

    # These bins must be rejected. The failure is raised from the ``else``
    # clause: raised inside the ``try`` body, it would be swallowed by the
    # handler and the check could never fail.
    try:
        s.plot_gc_content(bins=[1, 2, 10])
    except Exception:
        pass
    else:
        raise AssertionError("plot_gc_content accepted bins=[1, 2, 10]")
Esempio n. 12
0
    def __init__(self, infile, references, outdir, mapper, threads=4):
        """Store the mapping settings and count the reads of the input file.

        :param infile: input FastQ file; its number of reads is stored in
            ``self.L``.
        :param references: stored as ``self.references``.
        :param outdir: output directory, created if it does not exist
            (intermediate directories included).
        :param mapper: ``"minimap2"`` or ``"bwa"``; selects the command
            template stored in ``self.mapper_cmd``.
        :param threads: stored as ``self.threads``.
        :raises ValueError: if *mapper* is not one of the supported names.
        """
        # this may be used later on for other mapper or methodology
        mapper_commands = {
            "minimap2": "minimap2 -x map-pb -t {} {} {} -a > {}",
            "bwa": "bwa mem -M -t {} {} {} > {}",
        }
        # Fail early, before touching the file system: an unknown mapper
        # used to leave ``mapper_cmd`` undefined without any error.
        if mapper not in mapper_commands:
            raise ValueError(
                "mapper must be one of {}; got {!r}".format(
                    sorted(mapper_commands), mapper))

        self.infile = infile
        self.references = references

        self.outdir = outdir
        self.threads = threads

        if os.path.exists(outdir):
            logger.info("using {} for output".format(outdir))
        else:
            # makedirs also creates missing parent directories
            os.makedirs(outdir)

        self.mapper_cmd = mapper_commands[mapper]

        f = FastQ(self.infile)
        self.L = len(f)
        logger.info("Found {} reads in input FastQ file\n\n".format(self.L))
Esempio n. 13
0
class IsoSeqQC(object):
    """Quality-control helper working on a directory of IsoSeq result files.

    Use get_isoseq_files on smrtlink to get the proper files

    ::

        iso = IsoSeqQC()
        iso.hist_read_length_consensus_isoform() # histo CCS
        iso.stats() # "CCS" key is equivalent to summary metrics in CCS report

    todo: get CCS passes histogram . Where to get the info of passes ?

    """
    def __init__(self, directory=".", prefix=""):
        """Locate the result files in *directory* and load those found.

        When a file cannot be found (see :meth:`get_file`), the matching data
        attribute (``lq_sequence``, ``hq_sequence``, ``data``) is None.

        :param directory: directory where the files are searched.
        :param prefix: glob prefix inserted before each file name, except for
            the CCS FastA file which is looked up without prefix.
        """
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # Define the optional attributes up front so that they always exist;
        # stats() tests ``self.data is not None`` and would otherwise raise
        # AttributeError when the CSV file is missing.
        self.lq_sequence = None
        self.hq_sequence = None
        self.data = None

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence: self.ccs first holds the file name (or None),
        # then is replaced by the FastA reader when a file is found
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)

    def get_file(self, tag, noprefix=False):
        """Return the unique file of ``self.directory`` matching a pattern.

        :param tag: file name or glob pattern.
        :param noprefix: if True, ``self.prefix`` is not inserted before
            *tag*.
        :return: the matching path when exactly one file matches; None (with
            a message printed) when zero or several files match.
        """
        if noprefix:
            filenames = glob.glob(self.directory + os.sep + tag)
        else:
            filenames = glob.glob(self.directory + os.sep + self.prefix + tag)
        if len(filenames) == 1:
            return filenames[0]
        elif len(filenames) > 1:
            print("Found several files ending in %s" % tag)
        else:
            print("No files matching %s" % tag)
        return None

    def stats(self):
        """Collect summary statistics from the files that were loaded.

        As a side effect, ``idents_v``, ``full_v``, ``non_full_v`` and
        ``isoform_lengths`` are rebuilt from the identifiers of the low
        quality isoforms; they are empty lists when that file is missing.

        :return: dictionary with the keys ``strand`` and ``classification``
            (CSV file), ``lq_isoform``, ``hq_isoform`` and ``CCS``; a key is
            present only if the matching file was loaded.
        """
        results = {}
        if self.data is not None:
            logger.info("Reading strand")
            strand = self.data.strand
            results['strand'] = {
                "+": int((strand == "+").sum()),
                "-": int((strand == "-").sum()),
                "?": int(strand.isnull().sum())
            }

            results['classification'] = {
                "total_ccs_reads" : len(self.data),
                "five_prime_reads" : int(self.data.fiveseen.sum()),
                "three_prime_reads" : int(self.data.threeseen.sum()),
                "chimera" : int(self.data.chimera.sum()),
                "polyA_reads" : int(self.data.polyAseen.sum()),
            }

        if self.lq_isoforms:
            logger.info("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats() # number of

        if self.hq_isoforms:
            logger.info("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats() # number of polished HQ isoform

        if self.ccs:
            seq = [len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length" : pylab.mean(seq),
                "number_ccs_bases" : sum(seq),
                "number_ccs_reads" : len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        # Skip the parsing when the low quality file was not found instead of
        # failing on a missing reader.
        if self.lq_sequence is not None:
            for read in self.lq_sequence:
                # identifier made of four ';'-separated fields, the last
                # three being of the form key=integer
                ident, full, non_full, length = read['identifier'].decode().split(";")
                self.idents_v.append(ident)
                self.full_v.append(int(full.split("=")[1]))
                self.non_full_v.append(int(non_full.split("=")[1]))
                self.isoform_lengths.append(int(length.split("=")[1]))

        return results


    def to_summary(self, filename="sequana_summary_isoseq.json", data=None):
        """Save statistics into a JSON file

        :param filename: name of the output JSON file.
        :param data: dictionary to save. If not provided, use :meth:`stats`

        """
        from sequana.summary import Summary
        if data is None:
            data = self.stats()
        Summary("isoseq",self.sample_name, data=data).to_json(filename)


    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """Plot the histogram of the isoform read lengths.

        A second y-axis shows the number of reads not yet accumulated, i.e.
        the total number of reads minus the cumulative sum of the histogram.

        :param mode: ``"all"`` (low and high quality isoforms), ``"lq"`` (low
            quality only); any other value selects the high quality ones.
        :param bins: passed to :func:`pylab.hist`.
        :param rwidth: passed to :func:`pylab.hist`.
        :param align: passed to :func:`pylab.hist`.
        :param fontsize: font size of the axis labels.
        :param edgecolor: passed to :func:`pylab.hist` as ``ec``.
        :param kwargs: any other argument accepted by :func:`pylab.hist`.
        """
        pylab.clf()

        # Read only the file(s) required by the mode, so that e.g. mode="lq"
        # still works when the high quality file is missing.
        L = []
        if mode in ("all", "lq"):
            L += [len(read['sequence']) for read in self.lq_sequence]
        if mode != "lq":
            L += [len(read['sequence']) for read in self.hq_sequence]

        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                    ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")

    def hist_average_quality(self, fontsize=16, bins=None):
        """Plot stacked histograms of the average quality of the isoforms.

        The quality of a read is the mean, over its quality string, of the
        character code minus 33. High quality isoforms are drawn first and
        low quality ones are stacked on top.

        :param fontsize: font size of the legend.
        :param bins: bin edges given to :func:`numpy.histogram`. Defaults to
            ``range(0, 94)``.
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()])
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()])
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        # Follow the actual bin edges; with the default bins this gives the
        # former hard-coded limits [0.5, 93.5].
        pylab.xlim([X[0] + 0.5, X[-1] + 0.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
Esempio n. 14
0
class IsoSeqQC(object):
    """Quality-control helper working on a directory of IsoSeq result files.

    Use get_isoseq_files on smrtlink to get the proper files

    ::

        iso = IsoSeqQC()
        iso.hist_read_length_consensus_isoform() # histo CCS
        iso.stats() # "CCS" key is equivalent to summary metrics in CCS report

    todo: get CCS passes histogram . Where to get the info of passes ?

    """
    def __init__(self, directory=".", prefix=""):
        """Locate the result files in *directory* and load those found.

        When a file cannot be found (see :meth:`get_file`), the matching data
        attribute (``lq_sequence``, ``hq_sequence``, ``data``) is None.

        :param directory: directory where the files are searched.
        :param prefix: glob prefix inserted before each file name, except for
            the CCS FastA file which is looked up without prefix.
        """
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # Define the optional attributes up front so that they always exist;
        # stats() tests ``self.data is not None`` and would otherwise raise
        # AttributeError when the CSV file is missing.
        self.lq_sequence = None
        self.hq_sequence = None
        self.data = None

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence: self.ccs first holds the file name (or None),
        # then is replaced by the FastA reader when a file is found
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)

    def get_file(self, tag, noprefix=False):
        """Return the unique file of ``self.directory`` matching a pattern.

        :param tag: file name or glob pattern.
        :param noprefix: if True, ``self.prefix`` is not inserted before
            *tag*.
        :return: the matching path when exactly one file matches; None (with
            a message printed) when zero or several files match.
        """
        if noprefix:
            filenames = glob.glob(self.directory + os.sep + tag)
        else:
            filenames = glob.glob(self.directory + os.sep + self.prefix + tag)
        if len(filenames) == 1:
            return filenames[0]
        elif len(filenames) > 1:
            print("Found several files ending in %s" % tag)
        else:
            print("No files matching %s" % tag)
        return None

    def stats(self):
        """Collect summary statistics from the files that were loaded.

        As a side effect, ``idents_v``, ``full_v``, ``non_full_v`` and
        ``isoform_lengths`` are rebuilt from the identifiers of the low
        quality isoforms; they are empty lists when that file is missing.

        :return: dictionary with the keys ``strand`` and ``classification``
            (CSV file), ``lq_isoform``, ``hq_isoform`` and ``CCS``; a key is
            present only if the matching file was loaded.
        """
        results = {}
        if self.data is not None:
            logger.info("Reading strand")
            strand = self.data.strand
            results['strand'] = {
                "+": int((strand == "+").sum()),
                "-": int((strand == "-").sum()),
                "?": int(strand.isnull().sum())
            }

            results['classification'] = {
                "total_ccs_reads" : len(self.data),
                "five_prime_reads" : int(self.data.fiveseen.sum()),
                "three_prime_reads" : int(self.data.threeseen.sum()),
                "chimera" : int(self.data.chimera.sum()),
                "polyA_reads" : int(self.data.polyAseen.sum()),
            }

        if self.lq_isoforms:
            logger.info("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats() # number of

        if self.hq_isoforms:
            logger.info("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats() # number of polished HQ isoform

        if self.ccs:
            seq = [len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length" : pylab.mean(seq),
                "number_ccs_bases" : sum(seq),
                "number_ccs_reads" : len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        # Skip the parsing when the low quality file was not found instead of
        # failing on a missing reader.
        if self.lq_sequence is not None:
            for read in self.lq_sequence:
                # identifier made of four ';'-separated fields, the last
                # three being of the form key=integer
                ident, full, non_full, length = read['identifier'].decode().split(";")
                self.idents_v.append(ident)
                self.full_v.append(int(full.split("=")[1]))
                self.non_full_v.append(int(non_full.split("=")[1]))
                self.isoform_lengths.append(int(length.split("=")[1]))

        return results

    def to_summary(self, filename="sequana_summary_isoseq.json", data=None):
        """Save statistics into a JSON file

        :param filename: name of the output JSON file.
        :param data: dictionary to save. If not provided, use :meth:`stats`

        """
        from sequana.summary import Summary
        if data is None:
            data = self.stats()
        Summary("isoseq",self.sample_name, data=data).to_json(filename)

    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """Plot the histogram of the isoform read lengths.

        A second y-axis shows the number of reads not yet accumulated, i.e.
        the total number of reads minus the cumulative sum of the histogram.

        :param mode: ``"all"`` (low and high quality isoforms), ``"lq"`` (low
            quality only); any other value selects the high quality ones.
        :param bins: passed to :func:`pylab.hist`.
        :param rwidth: passed to :func:`pylab.hist`.
        :param align: passed to :func:`pylab.hist`.
        :param fontsize: font size of the axis labels.
        :param edgecolor: passed to :func:`pylab.hist` as ``ec``.
        :param kwargs: any other argument accepted by :func:`pylab.hist`.
        """
        pylab.clf()

        # Read only the file(s) required by the mode, so that e.g. mode="lq"
        # still works when the high quality file is missing.
        L = []
        if mode in ("all", "lq"):
            L += [len(read['sequence']) for read in self.lq_sequence]
        if mode != "lq":
            L += [len(read['sequence']) for read in self.hq_sequence]

        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                    ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[0:-1]- shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")

    def hist_average_quality(self, fontsize=16, bins=None):
        """Plot stacked histograms of the average quality of the isoforms.

        The quality of a read is the mean, over its quality string, of the
        character code minus 33. High quality isoforms are drawn first and
        low quality ones are stacked on top.

        :param fontsize: font size of the legend.
        :param bins: bin edges given to :func:`numpy.histogram`. Defaults to
            ``range(0, 94)``.
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()])
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()])
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        # Follow the actual bin edges; with the default bins this gives the
        # former hard-coded limits [0.5, 93.5].
        pylab.xlim([X[0] + 0.5, X[-1] + 0.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
Esempio n. 15
0
class IsoSeq(object):
    """Reader and plotting helper for a directory of IsoSeq result files.

    ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/

    """
    def __init__(self, directory=".", prefix="job-*"):
        """Locate the result files in *directory* and load those found.

        When a file cannot be found (see :meth:`get_file`), the matching data
        attribute (``lq_sequence``, ``hq_sequence``, ``data``) is None.

        :param directory: directory where the files are searched.
        :param prefix: glob prefix inserted before each file name, except for
            the CCS FastA file which is looked up without prefix.
        """
        self.prefix = prefix
        self.directory = directory

        # Define the optional attributes up front so that they always exist;
        # stats() tests ``self.data is not None`` and would otherwise raise
        # AttributeError when the CSV file is missing.
        self.lq_sequence = None
        self.hq_sequence = None
        self.data = None

        # low quality isoforms
        self.lq_isoforms = self.get_file("lq_isoforms.fastq")
        if self.lq_isoforms:
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        self.hq_isoforms = self.get_file("hq_isoforms.fastq")
        if self.hq_isoforms:
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        self.csv = self.get_file("-file.csv")
        if self.csv:
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence: self.ccs first holds the file name (or None),
        # then is replaced by the FastA reader when a file is found
        self.ccs = self.get_file("ccs.fasta", noprefix=True)
        if self.ccs:
            self.ccs = FastA(self.ccs)

    def get_file(self, tag, noprefix=False):
        """Return the unique file of ``self.directory`` matching a pattern.

        :param tag: file name or glob pattern.
        :param noprefix: if True, ``self.prefix`` is not inserted before
            *tag*.
        :return: the matching path when exactly one file matches; None (with
            a message printed) when zero or several files match.
        """
        if noprefix:
            filenames = glob.glob(self.directory + os.sep + tag)
        else:
            filenames = glob.glob(self.directory + os.sep + self.prefix + tag)
        if len(filenames) == 1:
            return filenames[0]
        elif len(filenames) > 1:
            print("Found several files ending in %s" % tag)
        else:
            print("No files matching %s" % tag)
        return None

    def stats(self):
        """Collect summary statistics from the files that were loaded.

        As a side effect, ``idents_v``, ``full_v``, ``non_full_v`` and
        ``isoform_lengths`` are rebuilt from the identifiers of the low
        quality isoforms; they are empty lists when that file is missing.

        :return: dictionary with the keys ``strand`` and ``classification``
            (CSV file), ``lq_isoform``, ``hq_isoform`` and ``CCS``; a key is
            present only if the matching file was loaded.
        """
        results = {}
        if self.data is not None:
            print("Reading strand")
            results['strand'] = {
                "+": sum(self.data.strand == "+"),
                "-": sum(self.data.strand == "-"),
                "?": sum(self.data.strand.isnull())
            }

            results['classification'] = {
                "total_ccs_reads": len(self.data),
                "five_prime_reads": sum(self.data.fiveseen),
                "three_prime_reads": sum(self.data.threeseen),
                "polyA_reads": sum(self.data.polyAseen),
            }

        if self.lq_isoforms:
            print("Reading LQ isoform")
            results['lq_isoform'] = self.lq_sequence.stats()  # number of

        if self.hq_isoforms:
            print("Reading HQ isoform")
            results['hq_isoform'] = self.hq_sequence.stats(
            )  # number of polished HQ isoform

        if self.ccs:
            # Work on the read lengths: taking the mean or the sum of the
            # sequences themselves cannot produce these metrics.
            seq = [len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length": pylab.mean(seq),
                "number_ccs_bases": sum(seq),
                "number_ccs_reads": len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        # Skip the parsing when the low quality file was not found instead of
        # failing on a missing reader.
        if self.lq_sequence is not None:
            for read in self.lq_sequence:
                # identifier made of four ';'-separated fields, the last
                # three being of the form key=integer
                ident, full, non_full, length = read['identifier'].decode().split(
                    ";")
                self.idents_v.append(ident)
                self.full_v.append(int(full.split("=")[1]))
                self.non_full_v.append(int(non_full.split("=")[1]))
                self.isoform_lengths.append(int(length.split("=")[1]))

        return results

    def hist_read_length_consensus_isoform(self,
                                           mode="all",
                                           bins=80,
                                           rwidth=0.8,
                                           align="left",
                                           fontsize=16,
                                           edgecolor="k",
                                           **kwargs):
        """Plot the histogram of the isoform read lengths.

        A second y-axis shows the number of reads not yet accumulated, i.e.
        the total number of reads minus the cumulative sum of the histogram.

        :param mode: ``"all"`` (low and high quality isoforms), ``"lq"`` (low
            quality only); any other value selects the high quality ones.
        :param bins: passed to :func:`pylab.hist`.
        :param rwidth: passed to :func:`pylab.hist`.
        :param align: passed to :func:`pylab.hist`.
        :param fontsize: font size of the axis labels.
        :param edgecolor: passed to :func:`pylab.hist` as ``ec``.
        :param kwargs: any other argument accepted by :func:`pylab.hist`.
        """

        pylab.clf()

        # Read only the file(s) required by the mode, so that e.g. mode="lq"
        # still works when the high quality file is missing.
        L = []
        if mode in ("all", "lq"):
            L += [len(read['sequence']) for read in self.lq_sequence]
        if mode != "lq":
            L += [len(read['sequence']) for read in self.hq_sequence]

        Y, X, _ = pylab.hist(L,
                             bins=bins,
                             rwidth=rwidth,
                             align=align,
                             ec=edgecolor,
                             **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()

        shift = (X[1] - X[0]) / 2

        ax_twin.plot(X[1:] - shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")

    def hist_average_quality(self, fontsize=16):
        """Plot stacked histograms of the average quality of the isoforms.

        The quality of a read is the mean of ``phred.ascii_to_quality`` over
        the characters of its quality string. High quality isoforms are
        drawn first and low quality ones are stacked on top.

        :param fontsize: font size of the legend.
        """
        # Use the reads of this instance; the method used to depend on a
        # global variable named ``iso``.
        hq_qv = [
            mean([phred.ascii_to_quality(X) for X in read['quality'].decode()])
            for read in self.hq_sequence
        ]
        lq_qv = [
            mean([phred.ascii_to_quality(X) for X in read['quality'].decode()])
            for read in self.lq_sequence
        ]

        Y1, X = numpy.histogram(hq_qv, bins=range(0, 94))
        Y2, X = numpy.histogram(lq_qv, bins=range(0, 94))
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)