Esempio n. 1
0
    def testSplitUnpaired(self):
        """Split test.fastq into two chunks, ignoring pairing, and check both.

        Each of the two output files is expected to contain the reads named
        'foo', 'bar' and 'baz', in that order.
        """
        here = os.path.dirname(__file__)
        fname = os.path.join(here, 'test.fastq')
        templ = os.path.join(here, 'test_templ')
        outputs = ['%s.1.fastq' % templ, '%s.2.fastq' % templ]

        try:
            ngsutils.fastq.split.fastq_split(fname,
                                             templ,
                                             2,
                                             ignore_pairs=True,
                                             quiet=True)

            for path in outputs:
                self.assertTrue(os.path.exists(path))

            for path in outputs:
                fq = FASTQ(path)
                try:
                    names = [x.name for x in fq.fetch(quiet=True)]
                finally:
                    # Release the reader even if iteration raises.
                    fq.close()
                self.assertEqual(names, ['foo', 'bar', 'baz'])
        finally:
            # Remove the generated files even when an assertion fails, so a
            # failing run does not leave stale output behind for the next one.
            for path in outputs:
                if os.path.exists(path):
                    os.unlink(path)
Esempio n. 2
0
    def testSplitThree(self):
        """Split test.fastq into three chunks, ignoring pairing, and check each.

        With pairing ignored the reads are dealt out one at a time, so the
        expected full names per chunk are:
        chunk 1: 'foo /1', 'bar /2'; chunk 2: 'foo /2', 'baz /1';
        chunk 3: 'bar /1', 'baz /2'.
        """
        here = os.path.dirname(__file__)
        fname = os.path.join(here, 'test.fastq')
        templ = os.path.join(here, 'test_templ')

        expected = [
            ('%s.1.fastq' % templ, ['foo /1', 'bar /2']),
            ('%s.2.fastq' % templ, ['foo /2', 'baz /1']),
            ('%s.3.fastq' % templ, ['bar /1', 'baz /2']),
        ]

        try:
            ngsutils.fastq.split.fastq_split(fname,
                                             templ,
                                             3,
                                             ignore_pairs=True,
                                             quiet=True)

            for path, _ in expected:
                self.assertTrue(os.path.exists(path))

            for path, fullnames in expected:
                fq = FASTQ(path)
                try:
                    found = [x.fullname for x in fq.fetch(quiet=True)]
                finally:
                    # Release the reader even if iteration raises.
                    fq.close()
                self.assertEqual(found, fullnames)
        finally:
            # Clean up the generated chunks whether or not the checks passed.
            for path, _ in expected:
                if os.path.exists(path):
                    os.unlink(path)
Esempio n. 3
0
    def testSplitThree(self):
        """Check a three-way, pairing-ignored split of test.fastq.

        The reads are expected to land round-robin in the three chunk files:
        chunk 1 holds 'foo /1' and 'bar /2', chunk 2 holds 'foo /2' and
        'baz /1', chunk 3 holds 'bar /1' and 'baz /2' (compared by fullname).
        """
        base_dir = os.path.dirname(__file__)
        fname = os.path.join(base_dir, 'test.fastq')
        templ = os.path.join(base_dir, 'test_templ')

        chunk_files = ['%s.1.fastq' % templ, '%s.2.fastq' % templ, '%s.3.fastq' % templ]
        chunk_names = [['foo /1', 'bar /2'], ['foo /2', 'baz /1'], ['bar /1', 'baz /2']]

        try:
            ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

            for path in chunk_files:
                self.assertTrue(os.path.exists(path))

            for path, wanted in zip(chunk_files, chunk_names):
                fq = FASTQ(path)
                try:
                    got = [x.fullname for x in fq.fetch(quiet=True)]
                finally:
                    # Always close the reader, including on a read error.
                    fq.close()
                self.assertEqual(got, wanted)
        finally:
            # A failed assertion must not leave chunk files on disk.
            for path in chunk_files:
                if os.path.exists(path):
                    os.unlink(path)
Esempio n. 4
0
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Write fixed-length tiles of every read in a FASTQ file to numbered outputs.

    For each read, windows ``[pos, pos + length)`` are taken at
    ``pos = 0, offset, 2 * offset, ...`` for as long as ``pos + length`` is
    strictly less than the read length.  The k-th window of every read goes
    to the k-th output, which is opened on first use via
    ``_open_file(outbase, out_idx, gz, quiet)``.  Each tile carries a
    ``#tile:<start>,<end>`` comment.

    Outputs are written under the temporary name returned by ``_open_file``
    and renamed to their final name only after all reads were processed.
    If an error occurs, all handles are closed and nothing is renamed.

    Raises:
        ValueError: if ``offset`` is not positive.
    """
    if offset <= 0:
        # A non-positive step never moves ``pos`` forward, so the tiling loop
        # below would never terminate and would keep opening output files.
        raise ValueError('offset must be a positive number (got %r)' % (offset,))

    fastq = FASTQ(fname)

    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    # First read long enough to need this tile index.
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                tile = read.subseq(pos, pos + length, comment="#tile:%s,%s" % (pos, pos + length))
                tile.write(outs[out_idx])
                pos += offset
                out_idx += 1
    finally:
        # Close every handle even if reading or writing failed part-way.
        for out in outs:
            out.close()

        fastq.close()

    # Publish the finished files under their final names.
    for tmp, final in fnames:
        os.rename(tmp, final)
Esempio n. 5
0
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Split a combined FASTQ file into one output file per position in a group.

    Consecutive reads that share the same name form a group.  The first read
    of each group is written to ``<out_template>.1.fastq``, the second to
    ``<out_template>.2.fastq``, and so on; further outputs are opened the
    first time a group is long enough to need them.  With ``gz`` set, the
    outputs are gzip files named ``<out_template>.<n>.fastq.gz``.

    All handles are closed on exit, including when reading or writing fails.
    """
    def open_out(idx):
        # One place for the naming scheme shared by the first and later outputs.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, idx), 'w')
        return open('%s.%s.fastq' % (out_template, idx), 'w')

    outs = []
    fq = None

    try:
        # The first output always exists, even when the input has no reads.
        outs.append(open_out(1))

        outidx = 1
        last_read = None
        fq = FASTQ(combined_fname)
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                # Same name as the previous read: next position in the group.
                outidx += 1
                if len(outs) < outidx:
                    outs.append(open_out(outidx))
            else:
                outidx = 1

            read.write(outs[outidx - 1])
            last_read = read
    finally:
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
Esempio n. 6
0
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Cut every read of a FASTQ file into overlapping or adjacent tiles.

    Starting at position 0 and stepping by ``offset``, a tile of ``length``
    bases is written while ``pos + length`` stays strictly below the read
    length.  Tile number k of every read is written to output number k;
    outputs are created lazily with ``_open_file(outbase, k, gz, quiet)``,
    which supplies the file object, a temporary name and the final name.
    Every tile is tagged with a ``#tile:<start>,<end>`` comment.

    The temporary files are renamed to their final names once all reads have
    been processed.  On failure the handles are closed and no rename happens.

    Raises:
        ValueError: if ``offset`` is zero or negative.
    """
    if offset <= 0:
        # With a step of zero or less the window never advances, which would
        # spin forever and open an unbounded number of output files.
        raise ValueError('offset must be a positive number (got %r)' % (offset,))

    fastq = FASTQ(fname)

    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                end = pos + length
                read.subseq(pos,
                            end,
                            comment="#tile:%s,%s" % (pos, end)).write(
                                outs[out_idx])
                pos += offset
                out_idx += 1
    finally:
        # Handles are released on both the success and the error path.
        for out in outs:
            out.close()

        fastq.close()

    # Only reached when every read was tiled without error.
    for tmp, final in fnames:
        os.rename(tmp, final)
Esempio n. 7
0
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Distribute runs of same-named reads from one FASTQ file over several.

    Reads are consumed in file order.  Whenever a read has the same name as
    the one before it, it goes to the next output index; otherwise the index
    resets to 1.  Output n is named ``<out_template>.<n>.fastq`` (or
    ``<out_template>.<n>.fastq.gz``, gzip-compressed, when ``gz`` is true).
    Output 1 is always created; higher outputs are created on demand.

    Input and output handles are closed even if an error interrupts the run.
    """
    if gz:
        pattern = '%s.%s.fastq.gz'
        opener = gzip.open
    else:
        pattern = '%s.%s.fastq'
        opener = open

    outs = []
    fq = None

    try:
        outs.append(opener(pattern % (out_template, 1), 'w'))

        outidx = 1
        last_read = None
        fq = FASTQ(combined_fname)
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                outidx += 1
                if len(outs) < outidx:
                    # First group long enough to need this output.
                    outs.append(
                        opener(pattern % (out_template, outidx), 'w'))
            else:
                # A new name starts a new group at the first output.
                outidx = 1

            read.write(outs[outidx - 1])
            last_read = read
    finally:
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
Esempio n. 8
0
def fastq_split(fname,
                outbase,
                chunks,
                ignore_pairs=False,
                gz=False,
                count_fname=None,
                quiet=False):
    """Deal the reads of a FASTQ file round-robin into ``chunks`` output files.

    Output n (1-based) is named ``<outbase>.<n>.fastq`` or, with ``gz``,
    ``<outbase>.<n>.fastq.gz``.  Each is written as ``.tmp.<basename>`` in
    the same directory and renamed once all reads have been written.

    When the input is paired (``fastq.is_paired``) and ``ignore_pairs`` is
    false, consecutive reads with the same name stay in the same chunk;
    otherwise every read advances to the next chunk.

    ``count_fname`` is accepted for interface compatibility but is not used
    here.  Unless ``quiet`` is set, each output name is reported on stderr.

    Raises:
        ValueError: if ``chunks`` is less than 1.
    """
    if chunks < 1:
        # Without any output file the first read would fail with an
        # IndexError; reject the request up front with a clear message.
        raise ValueError('chunks must be at least 1 (got %r)' % (chunks,))

    fastq = FASTQ(fname)

    is_paired = False if ignore_pairs else fastq.is_paired

    if gz:
        suffix = 'fastq.gz'
        opener = gzip.open
    else:
        suffix = 'fastq'
        opener = open

    outs = []
    fnames = []

    try:
        for n in range(chunks):
            fn = '%s.%s.%s' % (outbase, n + 1, suffix)
            tmp = os.path.join(os.path.dirname(fn),
                               '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            if not quiet:
                sys.stderr.write('Output file: %s\n' % fn)
            outs.append(opener(tmp, 'w'))

        # Start one past the end so the first read wraps around to chunk 0.
        i = chunks
        last_name = None

        for read in fastq.fetch(quiet=quiet):
            if not is_paired or read.name != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = read.name

            read.write(outs[i])
    finally:
        # Close everything on both the success and the error path.
        for out in outs:
            out.close()

        fastq.close()

    # Publish the chunks only after a complete, error-free run.
    for tmp, final in fnames:
        os.rename(tmp, final)
Esempio n. 9
0
    def testSplitUnpaired(self):
        """Check a two-way, pairing-ignored split of test.fastq.

        Both chunk files must exist afterwards and each must list the read
        names 'foo', 'bar', 'baz' in that order.
        """
        base_dir = os.path.dirname(__file__)
        fname = os.path.join(base_dir, 'test.fastq')
        templ = os.path.join(base_dir, 'test_templ')
        chunk_files = ('%s.1.fastq' % templ, '%s.2.fastq' % templ)

        try:
            ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

            for path in chunk_files:
                self.assertTrue(os.path.exists(path))

            for path in chunk_files:
                fq = FASTQ(path)
                try:
                    names = [x.name for x in fq.fetch(quiet=True)]
                finally:
                    # Always close the reader, including on a read error.
                    fq.close()
                self.assertEqual(names, ['foo', 'bar', 'baz'])
        finally:
            # A failed assertion must not leave chunk files on disk.
            for path in chunk_files:
                if os.path.exists(path):
                    os.unlink(path)
Esempio n. 10
0
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    """Split a FASTQ file into ``chunks`` files, assigning reads in rotation.

    The n-th output (counting from 1) is ``<outbase>.<n>.fastq``, or
    ``<outbase>.<n>.fastq.gz`` written through gzip when ``gz`` is true.
    Data is first written to ``.tmp.<basename>`` next to the target and the
    file is renamed after the whole input has been processed.

    If ``ignore_pairs`` is false and the input reports ``is_paired``, a read
    whose name equals the previous read's name stays in the same chunk.  In
    every other case each read moves on to the next chunk.

    ``count_fname`` is part of the signature but is not used by this
    function.  Output names are echoed to stderr unless ``quiet`` is set.

    Raises:
        ValueError: if ``chunks`` is less than 1.
    """
    if chunks < 1:
        # No outputs would be opened, and the first read would then hit an
        # IndexError; fail early with an explicit error instead.
        raise ValueError('chunks must be at least 1 (got %r)' % (chunks,))

    fastq = FASTQ(fname)

    if ignore_pairs:
        is_paired = False
    else:
        is_paired = fastq.is_paired

    extension = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []

    try:
        for n in range(1, chunks + 1):
            fn = '%s.%s.%s' % (outbase, n, extension)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            if not quiet:
                sys.stderr.write('Output file: %s\n' % fn)
            outs.append(opener(tmp, 'w'))

        # One past the last index: the first increment wraps to chunk 0.
        i = chunks
        last_name = None

        for read in fastq.fetch(quiet=quiet):
            if not is_paired:
                i += 1
            elif read.name != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = read.name

            read.write(outs[i])
    finally:
        # Handles are released even when reading or writing raised.
        for out in outs:
            out.close()

        fastq.close()

    # Reached only after a complete run; partial output is never published.
    for tmp, final in fnames:
        os.rename(tmp, final)
Esempio n. 11
0
    def testFQRead(self):
        fq = StringIO.StringIO('''\
@foo
ACGTacgtACGT
+
CDEFGHIJKLMN
''')
        out = StringIO.StringIO('')
        ngsutils.fastq.convertqual.fastq_convertqual(FASTQ(fileobj=fq), out=out, quiet=True)

        out.seek(0)
        fqout = FASTQ(fileobj=out)
        read = fqout.fetch().next()
        self.assertEqual(read.name, 'foo')
        self.assertEqual(read.seq, 'ACGTacgtACGT')
        self.assertEqual(read.qual, "$%&'()*+,-./")
Esempio n. 12
0
    def testFQRead(self):
        fq = StringIO.StringIO('''\
@foo
ACGTacgtACGT
+
CDEFGHIJKLMN
''')
        out = StringIO.StringIO('')
        ngsutils.fastq.convertqual.fastq_convertqual(FASTQ(fileobj=fq),
                                                     out=out,
                                                     quiet=True)

        out.seek(0)
        fqout = FASTQ(fileobj=out)
        read = fqout.fetch().next()
        self.assertEqual(read.name, 'foo')
        self.assertEqual(read.seq, 'ACGTacgtACGT')
        self.assertEqual(read.qual, "$%&'()*+,-./")
Esempio n. 13
0
    def assert_fastq_contains(self, base, args):
        """Check that each FASTQ file ``base % tag`` holds exactly the listed reads.

        ``args`` maps a tag to an indexable triple of whitespace-separated
        strings: read names, sequences and qualities.  When the sequences
        entry is empty/falsy only the names are checked; otherwise each
        read's (seq, qual) must match the entry at the same position.

        The check fails if a file contains a read whose name is not listed,
        or if the number of listed reads found differs from the number of
        names given.
        """
        for tag in args:
            spec = args[tag]
            valid = spec[0].split()
            seq_qual = {}
            if spec[1]:
                for n, s, q in zip(valid, spec[1].split(), spec[2].split()):
                    seq_qual[n] = (s, q)

            fq = FASTQ(base % tag)
            count = 0
            try:
                for read in fq.fetch():
                    if read.name not in valid:
                        # Report the unexpected read directly instead of
                        # comparing two unrelated strings to force a failure.
                        self.fail('extra read in %s: %s' % (tag, read.name))

                    count += 1
                    if seq_qual:
                        self.assertEqual(seq_qual[read.name], (read.seq, read.qual))
            finally:
                # Release the reader whether or not a check failed.
                fq.close()

            self.assertEqual(count, len(valid))
Esempio n. 14
0
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads whose names occur in both FASTQ inputs to out1 / out2.

    Each input is first passed through ``ngsutils.fastq.sort.fastq_sort``
    into a gzip-compressed temporary file, created in ``tmpdir`` or, when
    that is not given, in the directory of the input (``fq.fname``).  The two
    sorted files are then walked in step: reads with equal names are written
    as a pair, otherwise the read with the smaller name is skipped and
    counted as discarded.  (The walk presumably relies on fastq_sort ordering
    reads by name -- confirm against that function.)

    Returns:
        A tuple ``(pairs, discarded_1, discarded_2)``.  Reads still unread in
        one input when the other runs out are not included in the discard
        counts.

    The temporary files are removed and the readers closed even on error.
    """
    tmp_fnames = []
    fq_tmp1 = None
    fq_tmp2 = None

    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        for fq in (fq1, fq2):
            workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
            tmp = tempfile.NamedTemporaryFile(delete=False, prefix='.tmp', suffix='.gz', dir=workdir)
            # Record the name first so the file is removed if sorting fails.
            tmp_fnames.append(tmp.name)
            try:
                tmp_out = gzip.GzipFile(fileobj=tmp)
                try:
                    ngsutils.fastq.sort.fastq_sort(fq, out=tmp_out, tmpdir=workdir)
                finally:
                    tmp_out.close()
            finally:
                tmp.close()

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmp1 = FASTQ(tmp_fnames[0])
        fq_tmp2 = FASTQ(tmp_fnames[1])

        reader1 = fq_tmp1.fetch(quiet=quiet)
        reader2 = fq_tmp2.fetch(quiet=True)

        # A default of None makes an empty input end the walk cleanly instead
        # of letting StopIteration escape to the caller.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count the pair as soon as it is written, so the last pair
                # is included even when an input ends right after it.
                pairs += 1

                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)
    finally:
        for handle in (fq_tmp1, fq_tmp2):
            if handle is not None:
                handle.close()

        for path in tmp_fnames:
            if os.path.exists(path):
                os.unlink(path)

    return pairs, discarded_1, discarded_2
Esempio n. 15
0
def _sorted_gz_copy(fq, tmpdir):
    """Sort ``fq`` into a new gzip-compressed temp file and return its path."""
    workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
    tmp = tempfile.NamedTemporaryFile(
        delete=False,
        prefix='.tmp',
        suffix='.gz',
        dir=workdir)
    try:
        try:
            tmp_out = gzip.GzipFile(fileobj=tmp)
            try:
                ngsutils.fastq.sort.fastq_sort(
                    fq,
                    out=tmp_out,
                    tmpdir=workdir)
            finally:
                tmp_out.close()
        finally:
            tmp.close()
    except Exception:
        # Do not leave a half-written temp file behind when sorting fails.
        os.unlink(tmp.name)
        raise
    return tmp.name


def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Extract the reads common to two FASTQ inputs, matched by read name.

    Both inputs are sorted with ``ngsutils.fastq.sort.fastq_sort`` into
    gzip-compressed temporary files (placed in ``tmpdir`` if given, else next
    to the respective input).  The sorted files are then read side by side:
    when the current names are equal, both reads are written (to ``out1``
    and ``out2``); otherwise the read with the smaller name is dropped and
    counted.  (This presumably depends on fastq_sort ordering by read name --
    confirm against that function.)

    Returns:
        ``(pairs, discarded_1, discarded_2)``.  Reads remaining in one input
        after the other is exhausted are not added to the discard counts.

    Readers are closed and temporary files deleted on success and on error.
    """
    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    tmp1_fname = _sorted_gz_copy(fq1, tmpdir)
    try:
        tmp2_fname = _sorted_gz_copy(fq2, tmpdir)
        try:
            sys.stderr.write('Finding properly paired FASTQ reads...\n')

            fq_tmp1 = FASTQ(tmp1_fname)
            try:
                fq_tmp2 = FASTQ(tmp2_fname)
                try:
                    reader1 = fq_tmp1.fetch(quiet=quiet)
                    reader2 = fq_tmp2.fetch(quiet=True)

                    # None marks an exhausted input; this also covers inputs
                    # that are empty from the start.
                    read1 = next(reader1, None)
                    read2 = next(reader2, None)

                    while read1 is not None and read2 is not None:
                        if read1.name == read2.name:
                            read1.write(out1)
                            read2.write(out2)
                            # Counted before advancing so the final pair is
                            # not lost when an input ends on a match.
                            pairs += 1

                            read1 = next(reader1, None)
                            read2 = next(reader2, None)
                        elif read1.name < read2.name:
                            discarded_1 += 1
                            read1 = next(reader1, None)
                        else:
                            discarded_2 += 1
                            read2 = next(reader2, None)
                finally:
                    fq_tmp2.close()
            finally:
                fq_tmp1.close()
        finally:
            os.unlink(tmp2_fname)
    finally:
        os.unlink(tmp1_fname)

    return pairs, discarded_1, discarded_2