Esempio n. 1
0
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Cut every read of a FASTQ file into fixed-length tiles.

    For each read, a window of `length` bases is moved along the sequence in
    steps of `offset`.  The N-th window of every read is written to the N-th
    output file; output files are opened on demand with
    `_open_file(outbase, N, gz, quiet)`, which returns the file object, its
    temporary name and its final name.  Temporary files are renamed to their
    final names only after all reads were processed successfully.

    Arguments:
        fname   -- name of the input FASTQ file
        outbase -- base name handed to `_open_file` for each output
        length  -- tile length (bases)
        offset  -- distance between the starts of consecutive tiles
        gz      -- passed through to `_open_file`
        quiet   -- passed through to `FASTQ.fetch` and `_open_file`

    Raises:
        ValueError -- if `offset` is not positive; the window would never
                      advance and the tiling loop would never terminate.
    """
    if offset <= 0:
        raise ValueError("offset must be a positive number (got %r)" % (offset,))

    fastq = FASTQ(fname)

    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            # NOTE(review): the strict '<' means a window that ends exactly on
            # the last base of the read is not emitted -- confirm this is
            # intended before changing it.
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    # First time this tile index is reached: open its file.
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                end = pos + length
                tile = read.subseq(pos, end, comment="#tile:%s,%s" % (pos, end))
                tile.write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Release every handle even when reading or writing failed.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success, so partial output never gets a final name.
    for tmp, final_name in fnames:
        os.rename(tmp, final_name)
Esempio n. 2
0
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Split an interleaved FASTQ file into one file per read position.

    Consecutive reads that share a name are treated as one group.  The first
    read of a group is written to `<out_template>.1.fastq`, the second to
    `<out_template>.2.fastq`, and so on (with a `.gz` suffix and gzip
    compression when `gz` is true).  The first output file is always created;
    further files are created the first time they are needed.

    Arguments:
        combined_fname -- name of the interleaved FASTQ input
        out_template   -- prefix for the output file names
        gz             -- write gzip-compressed output
    """
    def _open_out(idx):
        # Open the output file for the idx-th (1-based) read of a group.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, idx), 'w')
        return open('%s.%s.fastq' % (out_template, idx), 'w')

    outs = []
    fq = None
    try:
        outs.append(_open_out(1))

        outidx = 1
        last_read = None

        fq = FASTQ(combined_fname)
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                # Same name as the previous read: next member of the group.
                outidx += 1
                if len(outs) < outidx:
                    outs.append(_open_out(outidx))
            else:
                # A new name starts a new group.
                outidx = 1

            read.write(outs[outidx - 1])
            last_read = read
    finally:
        # Close everything that was opened, even if reading/writing failed.
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
Esempio n. 3
0
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    """Distribute the reads of a FASTQ file round-robin over `chunks` files.

    Output files are named `<outbase>.<n>.fastq` (or `.fastq.gz` when `gz`
    is true) with n running from 1 to `chunks`.  Each is written under a
    `.tmp.`-prefixed name in the same directory and renamed once all reads
    have been written.

    Unless `ignore_pairs` is true, the input's `is_paired` attribute decides
    whether consecutive reads with the same name are kept in the same chunk.
    `count_fname` is accepted but not used by this function.
    """
    fastq = FASTQ(fname)
    is_paired = False if ignore_pairs else fastq.is_paired

    extension = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    for chunk_no in xrange(1, chunks + 1):
        final_name = '%s.%s.%s' % (outbase, chunk_no, extension)
        tmp_name = os.path.join(os.path.dirname(final_name),
                                '.tmp.%s' % os.path.basename(final_name))
        fnames.append((tmp_name, final_name))

        if not quiet:
            sys.stderr.write('Output file: %s\n' % final_name)
        outs.append(opener(tmp_name, 'w'))

    # Start past the end so that the very first read wraps around to chunk 0.
    target = chunks
    previous_name = None

    for read in fastq.fetch(quiet=quiet):
        # Advance to the next chunk for every read, except when pairing is
        # honoured and this read has the same name as the one before it.
        if not is_paired or read.name != previous_name:
            target += 1

        if target >= len(outs):
            target = 0

        previous_name = read.name
        read.write(outs[target])

    for handle in outs:
        handle.close()

    fastq.close()

    for tmp_name, final_name in fnames:
        os.rename(tmp_name, final_name)
Esempio n. 4
0
    def testFQRead(self):
        fq = StringIO.StringIO('''\
@foo
ACGTacgtACGT
+
CDEFGHIJKLMN
''')
        out = StringIO.StringIO('')
        ngsutils.fastq.convertqual.fastq_convertqual(FASTQ(fileobj=fq), out=out, quiet=True)

        out.seek(0)
        fqout = FASTQ(fileobj=out)
        read = fqout.fetch().next()
        self.assertEqual(read.name, 'foo')
        self.assertEqual(read.seq, 'ACGTacgtACGT')
        self.assertEqual(read.qual, "$%&'()*+,-./")
Esempio n. 5
0
    def assert_fastq_contains(self, base, args):
        """Assert that each FASTQ file `base % tag` holds the expected reads.

        `args` maps a tag to an indexable spec: item 0 is a whitespace-
        separated list of the read names expected in that file.  When item 1
        is truthy, items 1 and 2 are whitespace-separated sequences and
        quality strings, matched positionally with the names, and each read's
        (seq, qual) is checked too.  Any read with an unexpected name fails
        the test, as does a mismatch between the number of expected-name
        reads found and the number of names listed.
        """
        for tag in args:
            spec = args[tag]
            valid = spec[0].split()

            seq_qual = {}
            if spec[1]:
                seqs = spec[1].split()
                quals = spec[2].split()
                seq_qual = dict((name, (seq, qual))
                                for name, seq, qual in zip(valid, seqs, quals))

            fq = FASTQ(base % tag)
            count = 0
            for read in fq.fetch():
                if read.name not in valid:
                    # Deliberately unequal values: reports the stray read.
                    self.assertEqual('extra read in %s' % tag, read.name)
                    continue

                count += 1
                if seq_qual:
                    self.assertEqual(seq_qual[read.name], (read.seq, read.qual))

            self.assertEqual(count, len(valid))
Esempio n. 6
0
    def testSplitUnpaired(self):
        # Split test.fastq into two chunks while ignoring pairing; both
        # chunks are expected to contain the reads foo, bar, baz in order.
        here = os.path.dirname(__file__)
        fname = os.path.join(here, 'test.fastq')
        templ = os.path.join(here, 'test_templ')
        out_fnames = ['%s.%s.fastq' % (templ, n) for n in (1, 2)]

        ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

        for out_fname in out_fnames:
            self.assertTrue(os.path.exists(out_fname))

        readers = [FASTQ(out_fname) for out_fname in out_fnames]

        for reader in readers:
            names = [x.name for x in reader.fetch(quiet=True)]
            self.assertEqual(names, ['foo', 'bar', 'baz'])

        for reader in readers:
            reader.close()

        # Remove the generated chunk files.
        for out_fname in out_fnames:
            os.unlink(out_fname)
Esempio n. 7
0
    # Command line: [-f] <output.bam> <read1.fastq> [<read2.fastq>]
    # The first non-flag argument is the output name; the next one or two
    # arguments that name existing files are the FASTQ inputs.
    outname = None
    read1_fname = None
    read2_fname = None

    force = False

    for arg in sys.argv[1:]:
        if arg == "-f":
            force = True
        elif not outname:
            # -f only protects the output if it appears before the output name.
            if not force and os.path.exists(arg):
                usage("Output file exists! (Use -f to force overwriting): %s" % arg)
            outname = arg
        elif not read1_fname and os.path.exists(arg):
            read1_fname = arg
        elif not read2_fname and os.path.exists(arg):
            read2_fname = arg

    if not outname or not read1_fname:
        usage()

    read1 = FASTQ(read1_fname)
    # The second input is optional; read2 stays None for single-file input.
    read2 = FASTQ(read2_fname) if read2_fname else None

    bam = pysam.Samfile(outname, "wb")
    export_bam(bam, read1, read2)
    bam.close()

    read1.close()
    # Guard the close: without a second input there is nothing to close, and
    # calling .close() on None would raise AttributeError.
    if read2 is not None:
        read2.close()
Esempio n. 8
0
            if not os.path.exists(arg):
                usage("File %s doesn't exist!" % arg)
            fqname2 = arg
        elif not outname1:
            if os.path.exists(arg):
                usage("File %s exists!" % arg)
            outname1 = arg
        elif not outname2:
            if os.path.exists(arg):
                usage("File %s exists!" % arg)
            outname2 = arg

    # All four names (two inputs, two outputs) are required.
    if not fqname1 or not fqname2 or not outname1 or not outname2:
        usage()

    fq1 = FASTQ(fqname1)
    fq2 = FASTQ(fqname2)

    # Outputs are gzip-compressed only when the gz flag was set.
    if gz:
        out1 = gzip.open(outname1, 'w')
        out2 = gzip.open(outname2, 'w')
    else:
        out1 = open(outname1, 'w')
        out2 = open(outname2, 'w')

    # NOTE(review): the find_fastq_pairs definitions visible in this file
    # return (pairs, discarded_1, discarded_2); verify that the unpacking
    # order and the labels printed below match the version actually called.
    total1, total2, matched = find_fastq_pairs(fq1, fq2, out1, out2)

    print "Totals: %s, %s" % (total1, total2)
    print "Proper pairs: %s" % matched

    # NOTE(review): closing of fq2, out1 and out2 is not visible in this
    # excerpt -- confirm they are closed further down.
    fq1.close()
Esempio n. 9
0
            fname = arg

    # An input file and at least one configured filter are required.
    if not fname or not filters_config:
        usage()

    # Optional discard log: when a discard file name was given, build a
    # callback that appends one line per name it is called with.
    discard = None
    _d_file = None
    if discard_fname:
        _d_file = open(discard_fname, 'w')

        def _callback(name):
            # Drops the first character of the name before writing it
            # (presumably the leading '@' of a FASTQ header -- TODO confirm).
            _d_file.write('%s\n' % name[1:])

        discard = _callback

    fq = FASTQ(fname)

    # Build the filter chain: start from a reader over the input and wrap it
    # in each configured filter in turn, so the last filter is outermost.
    chain = FASTQReader(fq, veryverbose)
    for config in filters_config:
        # Each config entry is (filter class, option, option, ...).
        if verbose:
            sys.stderr.write(config[0].__name__)
            sys.stderr.write('\t%s\n' % '\t'.join([str(x) for x in config[1:]]))

        clazz = config[0]
        opts = config[1:]

        # QualFilter is the only filter that also receives the illumina flag.
        if clazz == QualFilter:
            chain = clazz(chain, *opts, verbose=veryverbose, discard=discard, illumina=illumina)
        else:
            chain = clazz(chain, *opts, verbose=veryverbose, discard=discard)
Esempio n. 10
0
    def testSplitThree(self):
        # Split test.fastq into three chunks while ignoring pairing and check
        # the full read names that end up in each chunk.
        here = os.path.dirname(__file__)
        fname = os.path.join(here, 'test.fastq')
        templ = os.path.join(here, 'test_templ')
        out_fnames = ['%s.%s.fastq' % (templ, n) for n in (1, 2, 3)]

        # Expected contents, in chunk order.
        expected = [
            ['foo /1', 'bar /2'],
            ['foo /2', 'baz /1'],
            ['bar /1', 'baz /2'],
        ]

        ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

        for out_fname in out_fnames:
            self.assertTrue(os.path.exists(out_fname))

        readers = [FASTQ(out_fname) for out_fname in out_fnames]

        for reader, wanted in zip(readers, expected):
            names = [x.fullname for x in reader.fetch(quiet=True)]
            self.assertEqual(names, wanted)

        for reader in readers:
            reader.close()

        # Remove the generated chunk files.
        for out_fname in out_fnames:
            os.unlink(out_fname)
Esempio n. 11
0
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads whose names occur in both FASTQ inputs.

    Each input is first passed through `ngsutils.fastq.sort.fastq_sort` into
    a gzipped temporary file (presumably sorted by read name, which the merge
    below relies on -- confirm against fastq_sort).  The two sorted files are
    then walked in lock-step: reads with equal names are written to `out1`
    and `out2`; a read whose name sorts before the other file's current read
    is counted as discarded and skipped.

    Arguments:
        fq1, fq2   -- FASTQ inputs; their `fname` attribute selects the
                      temporary directory when `tmpdir` is not given
        out1, out2 -- writable file objects receiving the paired reads
        tmpdir     -- directory for temporary files (optional)
        quiet      -- passed to the first reader's `fetch`

    Returns:
        (pairs, discarded_1, discarded_2).  Reads still unread in one file
        once the other is exhausted are not included in the discard counts.
    """
    tmp_fnames = []
    opened = []

    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        # Sort each input into its own gzipped temporary file.
        for fq in (fq1, fq2):
            workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(delete=False, prefix='.tmp', suffix='.gz', dir=workdir)
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(fq, out=tmp_out, tmpdir=workdir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmp1 = FASTQ(tmp_fnames[0])
        opened.append(fq_tmp1)
        fq_tmp2 = FASTQ(tmp_fnames[1])
        opened.append(fq_tmp2)

        reader1 = fq_tmp1.fetch(quiet=quiet)
        reader2 = fq_tmp2.fetch(quiet=True)

        # next(..., None) turns exhaustion into a None sentinel, so an empty
        # input simply yields zero pairs instead of raising StopIteration.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 and read2:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count the pair as soon as it is written; counting after
                # advancing would miss the final pair of the files.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)
    finally:
        # Always release the readers and remove the temporary files.
        for fq_tmp in opened:
            fq_tmp.close()
        for tmp_fname in tmp_fnames:
            if os.path.exists(tmp_fname):
                os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
Esempio n. 12
0
            read.clone(name=name, comment=comment).write(out)


def usage():
    print __doc__
    print """Usage: fastqutils merge {-slash} file1.fastq{.gz} file2.fastq{.gz} ...

-slash    Split the read name at a '/' (Illumina paired format)
"""
    sys.exit(1)


if __name__ == '__main__':
    # Command line: {-slash} file1 file2 ...
    # Arguments other than -slash are kept only if they name existing paths;
    # anything else is silently ignored.
    cli_args = sys.argv[1:]
    split_slashes = '-slash' in cli_args
    fnames = [arg for arg in cli_args
              if arg != '-slash' and os.path.exists(arg)]

    # Merging needs at least two inputs.
    if len(fnames) < 2:
        usage()

    fastqs = []
    for input_name in fnames:
        fastqs.append(FASTQ(input_name))

    fastq_merge(fastqs, split_slashes)

    for opened in fastqs:
        opened.close()
Esempio n. 13
0
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads whose names occur in both FASTQ inputs.

    Each input is first passed through `ngsutils.fastq.sort.fastq_sort` into
    a gzipped temporary file (presumably sorted by read name, which the merge
    below relies on -- confirm against fastq_sort).  The two sorted files are
    then walked in lock-step: reads with equal names are written to `out1`
    and `out2`; a read whose name sorts before the other file's current read
    is counted as discarded and skipped.

    Arguments:
        fq1, fq2   -- FASTQ inputs; their `fname` attribute selects the
                      temporary directory when `tmpdir` is not given
        out1, out2 -- writable file objects receiving the paired reads
        tmpdir     -- directory for temporary files (optional)
        quiet      -- passed to the first reader's `fetch`

    Returns:
        (pairs, discarded_1, discarded_2).  Reads still unread in one file
        once the other is exhausted are not included in the discard counts.
    """
    tmp_fnames = []
    opened = []

    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        # Sort each input into its own gzipped temporary file.
        for fq in (fq1, fq2):
            workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(
                delete=False,
                prefix='.tmp',
                suffix='.gz',
                dir=workdir)
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(
                        fq,
                        out=tmp_out,
                        tmpdir=workdir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmp1 = FASTQ(tmp_fnames[0])
        opened.append(fq_tmp1)
        fq_tmp2 = FASTQ(tmp_fnames[1])
        opened.append(fq_tmp2)

        reader1 = fq_tmp1.fetch(quiet=quiet)
        reader2 = fq_tmp2.fetch(quiet=True)

        # next(..., None) turns exhaustion into a None sentinel, so an empty
        # input simply yields zero pairs instead of raising StopIteration.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 and read2:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count the pair as soon as it is written; counting after
                # advancing would miss the final pair of the files.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)
    finally:
        # Always release the readers and remove the temporary files.
        for fq_tmp in opened:
            fq_tmp.close()
        for tmp_fname in tmp_fnames:
            if os.path.exists(tmp_fname):
                os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
Esempio n. 14
0
                usage("File %s doesn't exist!" % arg)
            fqname2 = arg
        elif not outname1:
            outname1 = arg
        elif not outname2:
            outname2 = arg

    # All four names (two inputs, two outputs) are required.
    if not fqname1 or not fqname2 or not outname1 or not outname2:
        usage()

    # Refuse to overwrite existing outputs unless forcing was requested.
    if not force:
        for fname in [outname1, outname2]:
            if os.path.exists(fname):
                usage("File %s exists!" % fname)

    fq1 = FASTQ(fqname1)
    fq2 = FASTQ(fqname2)

    # Outputs are gzip-compressed only when the gz flag was set.
    if gz:
        out1 = gzip.open(outname1, 'w')
        out2 = gzip.open(outname2, 'w')
    else:
        out1 = open(outname1, 'w')
        out2 = open(outname2, 'w')

    # Returns the number of pairs written and the per-input discard counts.
    paired, discard_1, discard_2 = find_fastq_pairs(fq1, fq2, out1, out2,
                                                    tmpdir)

    # NOTE(review): no close() of fq1, fq2, out1 or out2 is visible in this
    # excerpt -- confirm the handles are closed further down.
    print "Proper pairs: %s" % paired
    print "Discarded 1 : %s" % discard_1
    print "Discarded 2 : %s" % discard_2
Esempio n. 15
0
    for read in fastq.fetch(quiet=quiet):
        if include_comment:
            out.write('%s%s%s\n' %
                      (read.name, ' ' if read.comment else '', read.comment))
        else:
            out.write('%s\n' % read.name)


def usage():
    print __doc__
    print "Usage: fastqutils names {-comment} filename.fastq{.gz}"
    sys.exit(1)


if __name__ == '__main__':
    # Command line: {-comment} filename
    # Arguments other than -comment count only if they name an existing
    # path; when several do, the last one is used.
    cli_args = sys.argv[1:]
    include_comment = '-comment' in cli_args
    existing = [arg for arg in cli_args
                if arg != '-comment' and os.path.exists(arg)]
    fname = existing[-1] if existing else None

    if not fname:
        usage()

    fq = FASTQ(fname)
    export_names(fq, include_comment)
    fq.close()