def testSplitUnpaired(self):
    # Split test.fastq into two chunks with pairing ignored, then verify that
    # both chunk files exist and that each yields the read names foo/bar/baz.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')
    chunk_paths = ('%s.1.fastq' % templ, '%s.2.fastq' % templ)

    ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

    for path in chunk_paths:
        self.assertTrue(os.path.exists(path))

    # Open both chunks before reading either one, as the checks below expect.
    handles = [FASTQ(path) for path in chunk_paths]

    for handle in handles:
        found = [rec.name for rec in handle.fetch(quiet=True)]
        self.assertEqual(found, ['foo', 'bar', 'baz'])

    for handle in handles:
        handle.close()

    # Remove the generated chunk files so the test leaves nothing behind.
    for path in chunk_paths:
        os.unlink(path)
def testSplitThree(self):
    # Split test.fastq into three chunks with pairing ignored and verify the
    # exact full read names that land in each chunk file.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')
    chunk_paths = ['%s.%s.fastq' % (templ, num) for num in (1, 2, 3)]

    ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

    for path in chunk_paths:
        self.assertTrue(os.path.exists(path))

    handles = [FASTQ(path) for path in chunk_paths]

    # Expected full names per chunk, in chunk order.
    expected = [
        ['foo /1', 'bar /2'],
        ['foo /2', 'baz /1'],
        ['bar /1', 'baz /2'],
    ]
    for handle, wanted in zip(handles, expected):
        found = [rec.fullname for rec in handle.fetch(quiet=True)]
        self.assertEqual(found, wanted)

    for handle in handles:
        handle.close()

    # Remove the generated chunk files so the test leaves nothing behind.
    for path in chunk_paths:
        os.unlink(path)
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Write fixed-length tiles of every read in a FASTQ file.

    For each read, windows of `length` bases are taken starting at position
    0 and stepping by `offset`.  The N-th window of every read (N counted
    from 0) goes to the N-th output, which is opened on demand with
    `_open_file(outbase, N, gz, quiet)`.  Each tile carries the comment
    "#tile:<start>,<end>".

    Outputs are written under the temporary names returned by `_open_file`
    and renamed to their final names only after all reads were processed.

    Arguments:
        fname   -- path of the input FASTQ file
        outbase -- base name handed to `_open_file` for every output
        length  -- tile length in bases
        offset  -- step between consecutive tile starts; must be > 0
        gz      -- forwarded to `_open_file`
        quiet   -- forwarded to `FASTQ.fetch` and `_open_file`

    Raises:
        ValueError -- if `offset` is not positive (the tiling loop could
                      never advance and would open outputs without end)
    """
    if offset <= 0:
        raise ValueError('offset must be a positive number (got %s)' % (offset,))

    fastq = FASTQ(fname)
    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            # NOTE(review): the bound is strict, so a tile that would end
            # exactly at the last base of the read is not emitted.  Kept
            # as-is; confirm whether `<=` was intended.
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                end = pos + length
                tile = read.subseq(pos, end, comment="#tile:%s,%s" % (pos, end))
                tile.write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Always release the handles, even when reading or writing failed.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: publish the temporary files.
    for tmp, final in fnames:
        os.rename(tmp, final)
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Split a combined FASTQ file into one output file per mate position.

    Reads are taken in file order.  A read whose name equals the name of
    the read just before it is written to the next output file; any other
    read starts over at the first output file.  Output files are named
    "<out_template>.<N>.fastq" (or ".fastq.gz" when `gz` is true), with N
    starting at 1; file 1 is always created, further files only when needed.

    Arguments:
        combined_fname -- path of the combined FASTQ input
        out_template   -- prefix of the output file names
        gz             -- write gzip-compressed outputs when true
    """
    def _open_out(idx):
        # Open the idx-th (1-based) output file in the requested format.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, idx), 'w')
        return open('%s.%s.fastq' % (out_template, idx), 'w')

    outs = []
    fq = None
    try:
        outs.append(_open_out(1))
        fq = FASTQ(combined_fname)

        outidx = 1
        last_read = None
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                # Same name as the previous read: move on to the next file.
                outidx += 1
                if len(outs) < outidx:
                    outs.append(_open_out(outidx))
            else:
                outidx = 1
            read.write(outs[outidx - 1])
            last_read = read
    finally:
        # Release every handle even if reading or writing raised.
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    """Cut every read of a FASTQ file into windows of `length` bases.

    Windows start at position 0 and advance by `offset`.  The window with
    index N (from 0) of each read is written to output N, which is created
    lazily through `_open_file(outbase, N, gz, quiet)`.  Every written tile
    is tagged with the comment "#tile:<start>,<end>".

    `_open_file` returns a file object, a temporary name and a final name;
    the temporary names are renamed to the final names once all reads have
    been processed without error.

    Arguments:
        fname   -- input FASTQ path
        outbase -- base name passed through to `_open_file`
        length  -- number of bases per tile
        offset  -- distance between tile starts; must be > 0
        gz      -- passed through to `_open_file`
        quiet   -- passed through to `FASTQ.fetch` and `_open_file`

    Raises:
        ValueError -- when `offset` is zero or negative, because the window
                      position would never move forward
    """
    if offset <= 0:
        raise ValueError('offset must be a positive number (got %s)' % (offset,))

    fastq = FASTQ(fname)
    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            # NOTE(review): strict comparison means a window ending exactly
            # at the end of the read is skipped; behavior preserved here.
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                stop = pos + length
                read.subseq(
                    pos, stop, comment="#tile:%s,%s" % (pos, stop)).write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Close all handles regardless of success so nothing is leaked.
        for out in outs:
            out.close()
        fastq.close()

    # Publish results only after a fully successful run.
    for tmp_name, final_name in fnames:
        os.rename(tmp_name, final_name)
def fastq_unmerge(combined_fname, out_template, gz=False):
    """Distribute consecutive same-named reads over numbered FASTQ files.

    Walking the combined file in order, the first read of a run of equal
    names goes to file 1, the second to file 2, and so on; a read with a
    different name than its predecessor restarts at file 1.  Files are named
    "<out_template>.<N>.fastq", or "<out_template>.<N>.fastq.gz" when `gz`
    is true.  File 1 is opened up front; higher-numbered files are opened
    the first time they are needed.

    Arguments:
        combined_fname -- path of the combined FASTQ input
        out_template   -- prefix used for all output file names
        gz             -- gzip-compress the outputs when true
    """
    def _open_numbered(number):
        # Build the output name for this 1-based number and open it.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, number), 'w')
        return open('%s.%s.fastq' % (out_template, number), 'w')

    outs = []
    fq = None
    try:
        outs.append(_open_numbered(1))
        fq = FASTQ(combined_fname)

        outidx = 1
        last_read = None
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                outidx += 1
                if len(outs) < outidx:
                    outs.append(_open_numbered(outidx))
            else:
                # A new read name starts again at the first output.
                outidx = 1
            read.write(outs[outidx - 1])
            last_read = read
    finally:
        # Handles are closed on both the success and the failure path.
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    """Distribute the reads of a FASTQ file round-robin over `chunks` files.

    Outputs are named "<outbase>.<N>.fastq" (".fastq.gz" when `gz` is true)
    with N from 1 to `chunks`.  Each is first written as ".tmp.<basename>"
    in the same directory and renamed to its final name after every read
    has been written.

    When the input reports itself as paired (`FASTQ.is_paired`) and
    `ignore_pairs` is false, the target file only advances when the read
    name changes, so consecutive reads sharing a name stay in the same
    chunk.  Otherwise the target advances on every read.

    Arguments:
        fname        -- path of the input FASTQ file
        outbase      -- prefix of the output file names
        chunks       -- number of output files; must be at least 1
        ignore_pairs -- treat the input as unpaired when true
        gz           -- write gzip-compressed outputs when true
        count_fname  -- accepted for interface compatibility; not used by
                        this function
        quiet        -- suppress the "Output file" lines on stderr and pass
                        quiet through to `FASTQ.fetch`

    Raises:
        ValueError -- if `chunks` is smaller than 1 (there would be no file
                      to write the first read to)
    """
    if chunks < 1:
        raise ValueError('chunks must be at least 1 (got %s)' % (chunks,))

    fastq = FASTQ(fname)
    outs = []
    fnames = []

    try:
        is_paired = False if ignore_pairs else fastq.is_paired

        # The gz and plain variants differ only in extension and opener.
        extension = 'fastq.gz' if gz else 'fastq'
        opener = gzip.open if gz else open

        for idx in range(chunks):
            fn = '%s.%s.%s' % (outbase, idx + 1, extension)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            if not quiet:
                sys.stderr.write('Output file: %s\n' % fn)

            outs.append(opener(tmp, 'w'))

        # Start past the end so the first increment wraps to output 0.
        cur = chunks
        last_name = None

        for read in fastq.fetch(quiet=quiet):
            if not is_paired or read.name != last_name:
                cur += 1

            if cur >= len(outs):
                cur = 0

            last_name = read.name
            read.write(outs[cur])
    finally:
        # Release all handles even if reading or writing failed.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: move the temporary files into place.
    for tmp, final in fnames:
        os.rename(tmp, final)
def testFQRead(self):
    # Run fastq_convertqual on a single in-memory record and check that the
    # name and sequence are unchanged while the quality string is converted.
    source = StringIO.StringIO('''\
@foo
ACGTacgtACGT
+
CDEFGHIJKLMN
''')
    converted = StringIO.StringIO('')

    ngsutils.fastq.convertqual.fastq_convertqual(
        FASTQ(fileobj=source), out=converted, quiet=True)

    # Rewind so the converted record can be read back from the start.
    converted.seek(0)
    fqout = FASTQ(fileobj=converted)
    first = fqout.fetch().next()

    self.assertEqual(first.name, 'foo')
    self.assertEqual(first.seq, 'ACGTacgtACGT')
    self.assertEqual(first.qual, "$%&'()*+,-./")
def assert_fastq_contains(self, base, args):
    """Assert that the FASTQ file for each tag holds the expected reads.

    For every tag in `args`, the file `base % tag` is read and checked:
    every read name must be one of the expected names, and the number of
    reads must equal the number of expected names.

    Arguments:
        base -- format string; `base % tag` gives the FASTQ file name
        args -- mapping of tag to a 3-item sequence:
                [0] whitespace-separated expected read names,
                [1] whitespace-separated sequences in the same order as
                    the names, or a false value to skip this check,
                [2] whitespace-separated quality strings (only read when
                    [1] is given)
    """
    for tag in args:
        spec = args[tag]
        valid = spec[0].split()

        seq_qual = {}
        if spec[1]:
            for n, s, q in zip(valid, spec[1].split(), spec[2].split()):
                seq_qual[n] = (s, q)

        fq = FASTQ(base % tag)
        count = 0
        try:
            for read in fq.fetch():
                if read.name not in valid:
                    self.fail('extra read in %s: %s' % (tag, read.name))

                count += 1
                if seq_qual:
                    self.assertEqual(seq_qual[read.name], (read.seq, read.qual))
        finally:
            # Close the reader even when an assertion above fails.
            fq.close()

        self.assertEqual(count, len(valid))
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Write the reads whose names occur in both inputs; count the rest.

    Both inputs are first sorted with `ngsutils.fastq.sort.fastq_sort` into
    gzip-compressed temporary files.  The sorted files are then walked in
    lockstep: reads with equal names are written to `out1` / `out2`, and
    the read with the smaller name is dropped otherwise.
    (Assumes `fastq_sort` orders reads consistently with `<` on `read.name`
    -- TODO confirm.)

    Arguments:
        fq1, fq2   -- inputs handed to `fastq_sort`; each must expose an
                      `fname` attribute (presumably FASTQ objects)
        out1, out2 -- destinations passed to `read.write()` for matched
                      reads from `fq1` and `fq2` respectively
        tmpdir     -- directory for temporary files; defaults to the
                      directory of the respective input file
        quiet      -- passed to `fetch` for the first sorted file (the
                      second is always read quietly).  The status line on
                      stderr is written regardless.

    Returns:
        (pairs, discarded_1, discarded_2): the number of matched pairs
        written, and the number of unmatched reads from each input.  Reads
        left over after the other input ran out count as discarded.
    """
    tmp_fnames = []

    def _sort_to_tmp(fq):
        # Sort one input into a gzip temp file and return that file's name.
        workdir = tmpdir if tmpdir else os.path.dirname(fq.fname)
        tmp = tempfile.NamedTemporaryFile(delete=False, prefix='.tmp', suffix='.gz', dir=workdir)
        # Register immediately so the file is removed even if sorting fails.
        tmp_fnames.append(tmp.name)
        try:
            gz_out = gzip.GzipFile(fileobj=tmp)
            try:
                ngsutils.fastq.sort.fastq_sort(fq, out=gz_out, tmpdir=workdir)
            finally:
                gz_out.close()
        finally:
            tmp.close()
        return tmp.name

    fq_tmp1 = None
    fq_tmp2 = None
    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        tmp1_fname = _sort_to_tmp(fq1)
        tmp2_fname = _sort_to_tmp(fq2)

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        fq_tmp1 = FASTQ(tmp1_fname)
        fq_tmp2 = FASTQ(tmp2_fname)
        reader1 = fq_tmp1.fetch(quiet=quiet)
        reader2 = fq_tmp2.fetch(quiet=True)

        # next(..., None) turns an empty input into "no reads", not an error.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Count before advancing so the final pair is included.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)

        # Whatever remains on one side has no mate on the exhausted side.
        if read1 is not None:
            discarded_1 += 1 + sum(1 for _ in reader1)
        if read2 is not None:
            discarded_2 += 1 + sum(1 for _ in reader2)
    finally:
        # Close readers and remove temp files on success and on failure.
        for handle in (fq_tmp1, fq_tmp2):
            if handle is not None:
                handle.close()
        for tmp_fname in tmp_fnames:
            os.unlink(tmp_fname)

    return pairs, discarded_1, discarded_2
def find_fastq_pairs(fq1, fq2, out1, out2, tmpdir=None, quiet=False):
    """Extract properly paired reads from two FASTQ inputs.

    Each input is sorted via `ngsutils.fastq.sort.fastq_sort` into its own
    gzip-compressed temporary file.  The two sorted files are then merged
    by read name: equal names are written to `out1` and `out2`; when the
    names differ, the read with the smaller name is discarded.
    (Assumes the sort order agrees with `<` on `read.name` -- TODO confirm.)

    Arguments:
        fq1, fq2   -- inputs for `fastq_sort`; their `fname` attribute is
                      used to pick the default temp directory (presumably
                      FASTQ objects)
        out1, out2 -- targets given to `read.write()` for matched reads of
                      the first and second input
        tmpdir     -- where to create temporary files; when not given, the
                      directory containing each input file is used
        quiet      -- forwarded to `fetch` on the first sorted file; the
                      second sorted file is always fetched quietly.  The
                      progress line on stderr is always written.

    Returns:
        A tuple (pairs, discarded_1, discarded_2) with the number of pairs
        written and the number of unpaired reads seen in each input,
        including reads remaining after the other input was exhausted.
    """
    created = []

    def _sorted_copy(source):
        # Write a name-sorted, gzip-compressed copy of `source` to a temp
        # file and return the temp file's path.
        target_dir = tmpdir if tmpdir else os.path.dirname(source.fname)
        tmp = tempfile.NamedTemporaryFile(
            delete=False, prefix='.tmp', suffix='.gz', dir=target_dir)
        # Track the path first so cleanup also covers a failed sort.
        created.append(tmp.name)
        try:
            zipped = gzip.GzipFile(fileobj=tmp)
            try:
                ngsutils.fastq.sort.fastq_sort(
                    source, out=zipped, tmpdir=target_dir)
            finally:
                zipped.close()
        finally:
            tmp.close()
        return tmp.name

    sorted1 = None
    sorted2 = None
    pairs = 0
    discarded_1 = 0
    discarded_2 = 0

    try:
        tmp1_fname = _sorted_copy(fq1)
        tmp2_fname = _sorted_copy(fq2)

        sys.stderr.write('Finding properly paired FASTQ reads...\n')

        sorted1 = FASTQ(tmp1_fname)
        sorted2 = FASTQ(tmp2_fname)
        reader1 = sorted1.fetch(quiet=quiet)
        reader2 = sorted2.fetch(quiet=True)

        # A None sentinel replaces StopIteration, so empty inputs are fine.
        read1 = next(reader1, None)
        read2 = next(reader2, None)

        while read1 is not None and read2 is not None:
            if read1.name == read2.name:
                read1.write(out1)
                read2.write(out2)
                # Counted as soon as it is written; the last pair counts too.
                pairs += 1
                read1 = next(reader1, None)
                read2 = next(reader2, None)
            elif read1.name < read2.name:
                discarded_1 += 1
                read1 = next(reader1, None)
            else:
                discarded_2 += 1
                read2 = next(reader2, None)

        # Reads still pending on either side cannot have a mate anymore.
        if read1 is not None:
            discarded_1 += 1 + sum(1 for _ in reader1)
        if read2 is not None:
            discarded_2 += 1 + sum(1 for _ in reader2)
    finally:
        # Readers are closed and temp files removed whether or not we failed.
        for reader_obj in (sorted1, sorted2):
            if reader_obj is not None:
                reader_obj.close()
        for path in created:
            os.unlink(path)

    return pairs, discarded_1, discarded_2