Ejemplo n.º 1
0
def fastq_truncate(fname, max_len):
    '''
    Write every read of a FASTQ file to stdout, truncated to max_len.

    fname - the fastq filename
    max_len - the number of quality values (and sequence characters) to keep

    When is_colorspace_fastq(fname) is true and a sequence starts with one
    of the letters in "atcgATGC", one extra sequence character is kept.
    '''
    colorspace = is_colorspace_fastq(fname)

    for name, seq, qual in read_fastq(fname):
        # A leading base letter on a colorspace read does not count
        # against max_len, so the sequence slice is one longer.
        has_prefix = colorspace and seq[0] in "atcgATGC"
        seq_len = max_len + 1 if has_prefix else max_len
        sys.stdout.write('%s\n%s\n+\n%s\n' % (name, seq[:seq_len], qual[:max_len]))
Ejemplo n.º 2
0
def fastq_csencode(fname):
    '''
    Write the reads of a FASTQ file to stdout with each sequence passed
    through encoded_seq().

    fname - the fastq filename

    Reads with an empty sequence are skipped.  Two leading sequence
    characters are dropped when the sequence starts with one of 'ATCG',
    otherwise one; the first quality value is always dropped.
    '''
    for name, seq, qual in read_fastq(fname, quiet=False):
        if not seq:
            continue

        skip = 2 if seq[0] in 'ATCG' else 1
        record = (name, encoded_seq(seq[skip:]), qual[1:])
        sys.stdout.write('%s\n%s\n+\n%s\n' % record)
Ejemplo n.º 3
0
def fastq_truncate(fname, max_len):
    '''
    Truncate each read of a FASTQ file to max_len and write it to stdout.

    fname - the fastq filename
    max_len - the number of quality values (and sequence characters) to keep

    For files reported as colorspace by is_colorspace_fastq(), a sequence
    starting with one of the letters in "atcgATGC" keeps one extra character.
    '''
    is_cs = is_colorspace_fastq(fname)
    template = '%s\n%s\n+\n%s\n'

    for name, seq, qual in read_fastq(fname):
        keep = max_len
        if is_cs and seq[0] in "atcgATGC":
            # the leading base letter is not counted against max_len
            keep = max_len + 1
        sys.stdout.write(template % (name, seq[:keep], qual[:max_len]))
Ejemplo n.º 4
0
    def test_read_qualities(self):
        """
        Tests the ability to get a list of bases and their corresponding
        qualities from a fastq file
        :return:
        """
        seqs, quals = fqu.read_fastq(full_file_name)

        # Both collections must contain at least one entry
        for collection in (seqs, quals):
            self.assertGreater(len(collection), 0)
Ejemplo n.º 5
0
def fastq_csencode(fname):
    '''
    Run every non-empty sequence of a FASTQ file through encoded_seq()
    and write the resulting records to stdout.

    fname - the fastq filename

    The sequence loses its first two characters when it starts with one
    of 'ATCG' and its first character otherwise; the quality string
    always loses its first character.
    '''
    template = '%s\n%s\n+\n%s\n'

    for name, seq, qual in read_fastq(fname, quiet=False):
        if not seq:
            # empty sequences produce no output at all
            continue

        if seq[0] in 'ATCG':
            body = seq[2:]
        else:
            body = seq[1:]

        sys.stdout.write(template % (name, encoded_seq(body), qual[1:]))
Ejemplo n.º 6
0
def fastq_trim(fname, linker_5=None, linker_3=None, out=sys.stdout, pct_identity=0.8, min_trim=4, min_len=25, verbose=False):
    '''
    Strip 5' and/or 3' linker sequences from the reads of a FASTQ file.

    fname - the fastq filename
    linker_5 - the 5' linker to remove
    linker_3 - the 3' linker to remove
    out - an output stream (eg: file, stdout)
    pct_identity - the alignment identity that must be exceeded before a linker is stripped away
    min_trim - the distance away from the edges that the linkers must match w/in
    min_len - reads shorter than this after trimming are not written
    verbose - write each read and its linker alignments to stderr

    The counts of trimmed and removed reads are written to stderr at the end.
    '''

    colorspace = is_colorspace_fastq(fname)
    scoring = support.localalign.NucleotideScoringMatrix(2, -1)
    aligner = support.localalign.LocalAlignment(scoring, -1)
    n_removed = 0
    n_trimmed = 0

    for name, seq, qual in read_fastq(fname):
        if verbose:
            sys.stderr.write('Read: %s\n    : %s\n' % (name, seq))

        seq_len = len(seq)
        start, end = 0, seq_len

        if linker_5:
            hit = aligner.align(seq, linker_5)
            if verbose:
                sys.stderr.write("5' alignment:\n")
                hit.dump(sys.stderr)
            if hit.r_pos < min_trim and hit.identity > pct_identity:
                start = hit.r_end

        if linker_3:
            hit = aligner.align(seq, linker_3)
            if verbose:
                sys.stderr.write("3' alignment:\n")
                hit.dump(sys.stderr)
            if hit.r_end > seq_len - min_trim and hit.identity > pct_identity:
                end = hit.r_pos

        kept = seq[start:end]
        if len(kept) < min_len:
            n_removed += 1
            continue

        if start > 0 or end < seq_len:
            n_trimmed += 1

        if colorspace and seq_len != len(qual) and start == 0:
            # sequence and quality lengths differ and the read start is
            # untouched: the quality slice ends one position earlier
            kept_qual = qual[start:end - 1]
        else:
            kept_qual = qual[start:end]

        out.write('%s\n%s\n+\n%s\n' % (name, kept, kept_qual))

    sys.stderr.write('Trimmed: %s\n' % n_trimmed)
    sys.stderr.write('Removed: %s (len)\n' % n_removed)
Ejemplo n.º 7
0
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None):
    '''
    Split a FASTQ file into a number of smaller files, round-robin.

    fname - the fastq filename
    outbase - prefix of the output files; they are named
              <outbase>.<N>.fastq (or <outbase>.<N>.fastq.gz), N counting from 1
    chunks - the number of output files to write
    ignore_pairs - skip the is_paired_fastq() check and treat every read independently
    gz - write gzip-compressed output
    count_fname - if given, the total number of reads is written to this file

    Output is first written to '.tmp.*' files next to the final names and
    only renamed once every read has been written successfully.
    '''
    if ignore_pairs:
        is_paired = False
    else:
        is_paired = is_paired_fastq(fname)

    suffix = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    count = 0

    try:
        for chunk in range(chunks):
            fn = '%s.%s.%s' % (outbase, chunk + 1, suffix)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            sys.stderr.write('Output file: %s\n' % fn)
            outs.append(opener(tmp, 'w'))

        i = 0
        last_name = None

        for name, seq, qual in read_fastq(fname):
            count += 1
            sn = name.split()[0]

            # In paired mode a read whose first name token equals the
            # previous read's stays in the same output file.
            if not is_paired or sn != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = sn

            outs[i].write('%s\n%s\n+\n%s\n' % (name, seq, qual))
    finally:
        # Always release the handles, even when reading or writing fails
        for out in outs:
            out.close()

    for tmp, final in fnames:
        os.rename(tmp, final)

    if count_fname:
        with open(count_fname, 'w') as f:
            # write() only accepts strings; passing the bare int raised TypeError
            f.write('%s\n' % count)
Ejemplo n.º 8
0
    def test_create_hist(self):
        """
        Tests the building of a histogram of quality scores from a fastq file
        :return:
        """
        _, quals = fqu.read_fastq(full_file_name)
        histogram = fqu.create_hist(quals)

        self.assertEqual(len(histogram), 50)

        # bar chart of the read-quality histogram
        # NOTE(review): plt.show() may block the test run until the window
        # is closed - confirm this is intended
        positions = range(len(histogram))
        plt.bar(positions, histogram)
        plt.show()
Ejemplo n.º 9
0
    def test_find_gc_by_pos(self):
        """
        Tests the GC-by-position function
        :return:
        """
        seqs, _ = fqu.read_fastq(full_file_name)
        gc_ratios = fqu.find_gc_by_pos(seqs)

        self.assertEqual(len(gc_ratios), 100)

        # line plot of the GC ratio at each position for these reads
        # NOTE(review): plt.show() may block the test run until the window
        # is closed - confirm this is intended
        positions = range(len(gc_ratios))
        plt.plot(positions, gc_ratios)
        plt.show()
Ejemplo n.º 10
0
    def test_fastq_base_dist(self):
        """
        Quick test of the base distribution over all sequences in the
        test fastq file
        :return:
        """
        seqs, _ = fqu.read_fastq(full_file_name)
        base_dist = dnau.get_frequency_counts("".join(seqs))

        expected_counts = (
            ('A', 21132),
            ('C', 28272),
            ('G', 28742),
            ('T', 21836),
            # 'N' means not confident
            ('N', 18),
        )
        for base, expected in expected_counts:
            self.assertEqual(base_dist[base], expected)
Ejemplo n.º 11
0
def fastq_tag(fname, prefix, suffix):
    '''
    Write the reads of a FASTQ file to stdout with a prefix and/or suffix
    added to each read name.

    fname - the fastq filename
    prefix - text placed in front of the read name (falsy: nothing added)
    suffix - text placed after the read name (falsy: nothing added)

    Anything after the first whitespace in the header line is kept as a
    description, separated from the new name by a single space.
    '''
    for name, seq, qual in read_fastq(fname):
        # name[1:] drops the first character of the header (re-added as
        # '@' on output); split off the optional description.
        spl = name[1:].split(None, 1)
        read_name = spl[0] if spl else ''
        desc = spl[1] if len(spl) > 1 else None

        # Previously the read name was replaced by '' when neither prefix
        # nor suffix was given; now the name is always preserved.
        nname = '%s%s%s' % (prefix or '', read_name, suffix or '')

        if desc:
            nname = '%s %s' % (nname, desc)

        sys.stdout.write('@%s\n%s\n+\n%s\n' % (nname, seq, qual))
Ejemplo n.º 12
0
def fastq_tag(fname, prefix, suffix):
    '''
    Write the reads of a FASTQ file to stdout with a prefix and/or suffix
    added to each read name.

    fname - the fastq filename
    prefix - text placed in front of the read name (falsy: nothing added)
    suffix - text placed after the read name (falsy: nothing added)

    Anything after the first whitespace in the header line is kept as a
    description, separated from the new name by a single space.
    '''
    for name, seq, qual in read_fastq(fname):
        # name[1:] drops the first character of the header (re-added as
        # '@' on output); split off the optional description.
        spl = name[1:].split(None, 1)
        read_name = spl[0] if spl else ''
        desc = spl[1] if len(spl) > 1 else None

        # Previously the read name was replaced by '' when neither prefix
        # nor suffix was given; now the name is always preserved.
        nname = '%s%s%s' % (prefix or '', read_name, suffix or '')

        if desc:
            nname = '%s %s' % (nname, desc)

        sys.stdout.write('@%s\n%s\n+\n%s\n' % (nname, seq, qual))
Ejemplo n.º 13
0
def fastq_merge(fnames, split_slashes=False):
    '''
    Interleave the reads of several FASTQ files onto stdout.

    fnames - the fastq filenames; one read is taken from each file in turn
    split_slashes - split read names at the first '/' and move the
                    remainder (with its slash) into the description

    Within each round every file must yield the same read name; otherwise
    an error is written to stderr and the program exits with status 1.
    Merging stops as soon as any input runs out of reads.
    '''
    infiles = []

    first = True
    for fname in fnames:
        # only the first reader is opened non-quiet
        infiles.append((fname, read_fastq(fname, quiet=not first)))
        first = False

    if not infiles:
        # Nothing to merge; the loop below would otherwise never terminate
        return

    while True:
        lastname = None

        for fname, generator in infiles:
            # Catch only exhaustion of the input.  A bare except here used
            # to swallow the SystemExit raised by sys.exit(1) below.
            try:
                name, seq, qual = next(generator)
            except StopIteration:
                return

            if split_slashes and '/' in name:
                name, rest = name.split('/', 1)
                desc = ' /%s' % rest
            else:
                cols = name.split()
                name = cols[0]
                desc = cols[1] if len(cols) > 1 else ''

            if not lastname:
                lastname = name
            elif name != lastname:
                sys.stderr.write('Files are not paired! (error in: %s)\nExpected: %s\nGot     : %s\n' % (fname, lastname, name))
                sys.exit(1)

            sys.stdout.write('%s %s\n%s\n+\n%s\n' % (name, desc, seq, qual))
Ejemplo n.º 14
0
def fastq_convertqual(fname):
    '''
    Write the reads of a FASTQ file to stdout with every quality
    character shifted down by 31 code points.

    fname - the fastq filename
    '''
    for name, seq, qual in read_fastq(fname):
        shifted = ''.join([chr(ord(q) - 31) for q in qual])
        # The other FASTQ writers in this file treat `name` as already
        # starting with '@', so adding another one here produced '@@name'.
        sys.stdout.write('%s\n%s\n+\n%s\n' % (name, seq, shifted))
Ejemplo n.º 15
0
 def filter(self):
     """
     Yield every record produced by read_fastq(), counting each one in
     self.kept and, when self.verbose is set, logging its first field
     to stderr.
     """
     # NOTE(review): `fname` is neither a parameter nor read from self
     # here - confirm it is defined in an enclosing scope.
     records = read_fastq(fname, quiet=not self.verbose)
     for record in records:
         self.kept += 1
         if self.verbose:
             sys.stderr.write('[FASTQ] Read: %s\n' % record[0])
         yield record
Ejemplo n.º 16
0
def export_seq(fname):
    '''
    Write the sequences of a FASTQ file to stdout as '>name' / sequence
    line pairs.

    fname - the fastq filename
    '''
    for name, seq, _quals in read_fastq(fname, quiet=False):
        # name[1:] drops the first character of the FASTQ header line
        header = name[1:]
        sys.stdout.write('>%s\n%s\n' % (header, seq))
Ejemplo n.º 17
0
def export_qual(fname):
    '''
    Write the quality values of a FASTQ file to stdout as '>name' lines
    followed by space-separated integers (character code minus 33).

    fname - the fastq filename
    '''
    for name, _seq, quals in read_fastq(fname, quiet=False):
        # name[1:] drops the first character of the FASTQ header line
        header = name[1:]
        scores = [str(ord(ch) - 33) for ch in quals]
        sys.stdout.write('>%s\n%s\n' % (header, ' '.join(scores)))
Ejemplo n.º 18
0
def fastq_stats(fname, verbose=False):
    '''
    Print summary statistics for a FASTQ file to stdout.

    fname - the fastq filename
    verbose - also print the per-scale quality totals, the read count for
              every length and the numeric mean quality at each position

    Reports the sequence space, pairing, quality scale, number of reads,
    the read-length distribution, the per-position quality distribution
    and a one-line string of mean quality characters by position.

    Reading stops at the first malformed record.  A KeyboardInterrupt
    while reading is swallowed so that the statistics gathered so far
    are still printed.
    '''
    cs = is_colorspace_fastq(fname)
    if cs:
        print("Space:\tcolorspace")
    else:
        print("Space:\tnucleotide")

    pairs = is_paired_fastq(fname)
    if pairs > 0:
        print("Pairing:\tPaired-end (%s)" % pairs)
    else:
        print("Pairing:\tFragmented")

    qual_totals = fastq_qualtype(fname)

    print("Quality scale:\t%s" % qual_totals[-1][1])
    if verbose:
        print(' '.join(['(%s,%s)' % (q[1], q[0]) for q in qual_totals]))

    lengths = []    # lengths[n]: number of reads with n quality values
    posquals = []   # posquals[pos][q - 33]: reads with quality code q at pos
    qualities = []  # qualities[pos]: sum of the quality character codes at pos
    total = []      # total[pos]: number of reads with at least pos quality values
    total_reads = 0
    line = 0        # lines consumed by the records read so far
    try:
        for name, seq, qual in read_fastq(fname):
            if not name[0] == '@':
                print('Invalid formatted record [line %s]' % line)
                break

            if cs:
                # colorspace sequences carry one more character than qualities
                if len(seq) != len(qual) + 1:
                    print('Seq / qual mismatch [line %s]' % line)
                    break
            else:
                if len(seq) != len(qual):
                    print('Seq / qual mismatch [line %s]' % line)
                    break

            line += 4
            total_reads += 1

            while len(total) < len(qual) + 1:
                total.append(0)
            for x in range(len(qual) + 1):
                total[x] += 1

            # grow the per-position accumulators up to index len(qual)
            while len(qual) > len(lengths) - 1:
                lengths.append(0)
                qualities.append(0)
                posquals.append([])

            lengths[len(qual)] += 1

            for idx, q in enumerate([ord(x) for x in qual]):
                qualities[idx] += q
                while len(posquals[idx]) < (q - 32):
                    posquals[idx].append(0)
                posquals[idx][q - 33] += 1

    except KeyboardInterrupt:
        pass

    print("Number of reads:\t%s" % total_reads)
    print("")

    mean, stdev, min_val, pct25, pct50, pct75, max_val = stats_counts(lengths)

    print("Length distribution")
    print('Mean:\t%s' % mean)
    print('StdDev:\t%s' % stdev)
    print('Min:\t%s' % min_val)
    print('25 percentile:\t%s' % pct25)
    print('Median:\t%s' % pct50)
    print('75 percentile:\t%s' % pct75)
    print('Max:\t%s' % max_val)

    if verbose:
        print('')
        for idx, count in enumerate(lengths[::-1]):
            if count:
                print("%s\t%s" % (len(lengths) - idx - 1, count))

    print("Quality distribution")
    print("pos\tmean\tstdev\tmin\t25pct\t50pct\t75pct\tmax")

    for pos, quals in enumerate(posquals):
        if not quals:
            continue

        # stats_counts() used to be called twice here, the first result
        # being unpacked and never used; one call is enough.
        sys.stdout.write('%s\t' % (pos + 1))
        sys.stdout.write('\t'.join([str(x) for x in stats_counts(quals)]))
        sys.stdout.write('\n')

    print("")
    print("Quality by position")

    for i, x in enumerate(qualities):
        # explicit floor division: the mean must stay an int for chr()
        q = x // total[i]
        if q > 33:
            sys.stdout.write(chr(q))
        else:
            sys.stdout.write('~')

    if verbose:
        print('')
        for i, q in enumerate(qualities):
            print('[%s] %s' % (i, (q // total[i]) - 33))

    print('')
Ejemplo n.º 19
0
#!/usr/bin/env python
## category General
## desc Write out the read names
'''
Writes out the read names present in the FASTQ file.
'''

import os
import sys

from fastq_utils import read_fastq


def usage():
    '''
    Print the module docstring and a usage line, then exit with status 1.
    '''
    # The call form with a single argument prints identically on
    # Python 2 and is also valid syntax on Python 3.
    print(__doc__)
    print("Usage: fastqutils names filename.fastq{.gz}")
    sys.exit(1)

if __name__ == '__main__':
    # The last command-line argument that names an existing path is used
    fname = None
    for candidate in reversed(sys.argv[1:]):
        if os.path.exists(candidate):
            fname = candidate
            break

    if not fname:
        usage()

    for name, _seq, _qual in read_fastq(fname):
        # first whitespace-delimited token of the header, minus its first character
        read_name = name.split()[0][1:]
        sys.stdout.write('%s\n' % read_name)
Ejemplo n.º 20
0
 def filter(self):
     """
     Pass through every record from read_fastq(), incrementing self.kept
     for each one and, when self.verbose is set, writing its first field
     to stderr.
     """
     # NOTE(review): `fname` is neither a parameter nor read from self
     # here - confirm it is defined in an enclosing scope.
     source = read_fastq(fname, quiet=not self.verbose)
     for entry in source:
         self.kept += 1
         if self.verbose:
             sys.stderr.write('[FASTQ] Read: %s\n' % entry[0])
         yield entry
Ejemplo n.º 21
0
## category General
## desc Write out the read names
'''
Writes out the read names present in the FASTQ file.
'''

import os
import sys

from fastq_utils import read_fastq


def usage():
    '''
    Print the module docstring and a usage line, then exit with status 1.
    '''
    # The call form with a single argument prints identically on
    # Python 2 and is also valid syntax on Python 3.
    print(__doc__)
    print("Usage: fastqutils names filename.fastq{.gz}")
    sys.exit(1)


if __name__ == '__main__':
    # The last command-line argument that names an existing path is used
    fname = None
    for candidate in reversed(sys.argv[1:]):
        if os.path.exists(candidate):
            fname = candidate
            break

    if not fname:
        usage()

    for name, _seq, _qual in read_fastq(fname):
        # first whitespace-delimited token of the header, minus its first character
        read_name = name.split()[0][1:]
        sys.stdout.write('%s\n' % read_name)
Ejemplo n.º 22
0
def fastq_stats(fname, verbose=False):
    '''
    Print summary statistics for a FASTQ file to stdout.

    fname - the fastq filename
    verbose - also print the per-scale quality totals, the read count for
              every length and the numeric mean quality at each position

    Reports the sequence space, pairing, quality scale, number of reads,
    the read-length distribution, the per-position quality distribution
    and a one-line string of mean quality characters by position.

    Reading stops at the first malformed record.  A KeyboardInterrupt
    while reading is swallowed so that the statistics gathered so far
    are still printed.
    '''
    cs = is_colorspace_fastq(fname)
    if cs:
        print("Space:\tcolorspace")
    else:
        print("Space:\tnucleotide")

    pairs = is_paired_fastq(fname)
    if pairs > 0:
        print("Pairing:\tPaired-end (%s)" % pairs)
    else:
        print("Pairing:\tFragmented")

    qual_totals = fastq_qualtype(fname)

    print("Quality scale:\t%s" % qual_totals[-1][1])
    if verbose:
        print(' '.join(['(%s,%s)' % (q[1], q[0]) for q in qual_totals]))

    lengths = []    # lengths[n]: number of reads with n quality values
    posquals = []   # posquals[pos][q - 33]: reads with quality code q at pos
    qualities = []  # qualities[pos]: sum of the quality character codes at pos
    total = []      # total[pos]: number of reads with at least pos quality values
    total_reads = 0
    line = 0        # lines consumed by the records read so far
    try:
        for name, seq, qual in read_fastq(fname):
            if not name[0] == '@':
                print('Invalid formatted record [line %s]' % line)
                break

            if cs:
                # colorspace sequences carry one more character than qualities
                if len(seq) != len(qual) + 1:
                    print('Seq / qual mismatch [line %s]' % line)
                    break
            else:
                if len(seq) != len(qual):
                    print('Seq / qual mismatch [line %s]' % line)
                    break

            line += 4
            total_reads += 1

            while len(total) < len(qual) + 1:
                total.append(0)
            for x in range(len(qual) + 1):
                total[x] += 1

            # grow the per-position accumulators up to index len(qual)
            while len(qual) > len(lengths) - 1:
                lengths.append(0)
                qualities.append(0)
                posquals.append([])

            lengths[len(qual)] += 1

            for idx, q in enumerate([ord(x) for x in qual]):
                qualities[idx] += q
                while len(posquals[idx]) < (q - 32):
                    posquals[idx].append(0)
                posquals[idx][q - 33] += 1

    except KeyboardInterrupt:
        pass

    print("Number of reads:\t%s" % total_reads)
    print("")

    mean, stdev, min_val, pct25, pct50, pct75, max_val = stats_counts(lengths)

    print("Length distribution")
    print('Mean:\t%s' % mean)
    print('StdDev:\t%s' % stdev)
    print('Min:\t%s' % min_val)
    print('25 percentile:\t%s' % pct25)
    print('Median:\t%s' % pct50)
    print('75 percentile:\t%s' % pct75)
    print('Max:\t%s' % max_val)

    if verbose:
        print('')
        for idx, count in enumerate(lengths[::-1]):
            if count:
                print("%s\t%s" % (len(lengths) - idx - 1, count))

    print("Quality distribution")
    print("pos\tmean\tstdev\tmin\t25pct\t50pct\t75pct\tmax")

    for pos, quals in enumerate(posquals):
        if not quals:
            continue

        # stats_counts() used to be called twice here, the first result
        # being unpacked and never used; one call is enough.
        sys.stdout.write('%s\t' % (pos + 1))
        sys.stdout.write('\t'.join([str(x) for x in stats_counts(quals)]))
        sys.stdout.write('\n')

    print("")
    print("Quality by position")

    for i, x in enumerate(qualities):
        # explicit floor division: the mean must stay an int for chr()
        q = x // total[i]
        if q > 33:
            sys.stdout.write(chr(q))
        else:
            sys.stdout.write('~')

    if verbose:
        print('')
        for i, q in enumerate(qualities):
            print('[%s] %s' % (i, (q // total[i]) - 33))

    print('')
Ejemplo n.º 23
0
def fastq_convertqual(fname):
    '''
    Write the reads of a FASTQ file to stdout with every quality
    character shifted down by 31 code points.

    fname - the fastq filename
    '''
    for name, seq, qual in read_fastq(fname):
        shifted = ''.join([chr(ord(q) - 31) for q in qual])
        # The other FASTQ writers in this file treat `name` as already
        # starting with '@', so adding another one here produced '@@name'.
        sys.stdout.write('%s\n%s\n+\n%s\n' % (name, seq, shifted))
Ejemplo n.º 24
0
def fastq_split(fname,
                outbase,
                chunks,
                ignore_pairs=False,
                gz=False,
                count_fname=None):
    '''
    Split a FASTQ file into a number of smaller files, round-robin.

    fname - the fastq filename
    outbase - prefix of the output files; they are named
              <outbase>.<N>.fastq (or <outbase>.<N>.fastq.gz), N counting from 1
    chunks - the number of output files to write
    ignore_pairs - skip the is_paired_fastq() check and treat every read independently
    gz - write gzip-compressed output
    count_fname - if given, the total number of reads is written to this file

    Output is first written to '.tmp.*' files next to the final names and
    only renamed once every read has been written successfully.
    '''
    if ignore_pairs:
        is_paired = False
    else:
        is_paired = is_paired_fastq(fname)

    suffix = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    count = 0

    try:
        for chunk in range(chunks):
            fn = '%s.%s.%s' % (outbase, chunk + 1, suffix)
            tmp = os.path.join(os.path.dirname(fn),
                               '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            sys.stderr.write('Output file: %s\n' % fn)
            outs.append(opener(tmp, 'w'))

        i = 0
        last_name = None

        for name, seq, qual in read_fastq(fname):
            count += 1
            sn = name.split()[0]

            # In paired mode a read whose first name token equals the
            # previous read's stays in the same output file.
            if not is_paired or sn != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = sn

            outs[i].write('%s\n%s\n+\n%s\n' % (name, seq, qual))
    finally:
        # Always release the handles, even when reading or writing fails
        for out in outs:
            out.close()

    for tmp, final in fnames:
        os.rename(tmp, final)

    if count_fname:
        with open(count_fname, 'w') as f:
            # write() only accepts strings; passing the bare int raised TypeError
            f.write('%s\n' % count)