def testSplit(self):
    # Merge two single-read inputs whose names end in "/1" and "/2" with
    # split_slashes=True; the expected output has the slash suffix
    # separated from the read name by a space.
    first = StringIO.StringIO('''\
@foo/1 comment1
ACGTACGT
+
;;;;;;;;
''')
    second = StringIO.StringIO('''\
@foo/2 comment2
acgtacgt
+
AAAAAAAA
''')
    merged = StringIO.StringIO('')

    readers = [FASTQ(fileobj=first), FASTQ(fileobj=second)]
    ngsutils.fastq.merge.fastq_merge(
        readers, split_slashes=True, out=merged, quiet=True)

    expected = '''\
@foo /1 comment1
ACGTACGT
+
;;;;;;;;
@foo /2 comment2
acgtacgt
+
AAAAAAAA
'''
    self.assertEqual(merged.getvalue(), expected)
def testSplitThree(self):
    # Split test.fastq into three chunks with pairing ignored and check
    # which reads land in which output file.
    #
    # Cleanup runs in `finally` blocks so that a failing assertion does not
    # leave open FASTQ handles or stale test_templ.N.fastq files behind for
    # the next test run.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')

    out_fnames = ['%s.%s.fastq' % (templ, idx) for idx in (1, 2, 3)]
    expected_names = [
        ['foo /1', 'bar /2'],
        ['foo /2', 'baz /1'],
        ['bar /1', 'baz /2'],
    ]

    try:
        ngsutils.fastq.split.fastq_split(fname, templ, 3, ignore_pairs=True, quiet=True)

        for out_fname in out_fnames:
            self.assertTrue(os.path.exists(out_fname))

        for out_fname, expected in zip(out_fnames, expected_names):
            fq = FASTQ(out_fname)
            try:
                names = [x.fullname for x in fq.fetch(quiet=True)]
                self.assertEqual(names, expected)
            finally:
                fq.close()
    finally:
        # Only remove what was actually created so that an earlier failure
        # is not masked by an OSError from unlink.
        for out_fname in out_fnames:
            if os.path.exists(out_fname):
                os.unlink(out_fname)
def testSplitUnpaired(self):
    # Split test.fastq into two chunks with pairing ignored; each chunk is
    # expected to hold one read named foo, bar and baz.
    #
    # Cleanup runs in `finally` blocks so that a failing assertion does not
    # leave open FASTQ handles or stale test_templ.N.fastq files behind.
    here = os.path.dirname(__file__)
    fname = os.path.join(here, 'test.fastq')
    templ = os.path.join(here, 'test_templ')

    out_fnames = ['%s.%s.fastq' % (templ, idx) for idx in (1, 2)]

    try:
        ngsutils.fastq.split.fastq_split(fname, templ, 2, ignore_pairs=True, quiet=True)

        for out_fname in out_fnames:
            self.assertTrue(os.path.exists(out_fname))

        for out_fname in out_fnames:
            fq = FASTQ(out_fname)
            try:
                names = [x.name for x in fq.fetch(quiet=True)]
                self.assertEqual(names, ['foo', 'bar', 'baz'])
            finally:
                fq.close()
    finally:
        # Only remove what was actually created so that an earlier failure
        # is not masked by an OSError from unlink.
        for out_fname in out_fnames:
            if os.path.exists(out_fname):
                os.unlink(out_fname)
def testMissingBoth(self):
    # Each input has one read the other lacks (bar vs. baz); only the reads
    # present in both inputs (foo, quux) are expected in the outputs, and
    # both outputs are expected to be identical.
    left_src = StringIO.StringIO('''\
@foo comment
ACGTACGT
+
;;;;;;;;
@bar
ACGTACGT
+
;;;;;;;;
@quux
ACGTACGT
+
;;;;;;;;
''')
    right_src = StringIO.StringIO('''\
@foo comment
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGT
+
;;;;;;;;
@quux
ACGTACGT
+
;;;;;;;;
''')
    left_out = StringIO.StringIO('')
    right_out = StringIO.StringIO('')

    left = FASTQ(fileobj=left_src)
    right = FASTQ(fileobj=right_src)
    ngsutils.fastq.properpairs.find_fastq_pairs(left, right, left_out, right_out, quiet=True)

    self.assertEqual(left_out.getvalue(), right_out.getvalue())

    expected = '''\
@foo comment
ACGTACGT
+
;;;;;;;;
@quux
ACGTACGT
+
;;;;;;;;
'''
    self.assertEqual(left_out.getvalue(), expected)
def fastq_unmerge(combined_fname, out_template, gz=False):
    '''
    Split a combined FASTQ file into one output file per read position.

    Consecutive reads that share the same name are treated as a group: the
    first read of a group is written to "<out_template>.1.fastq", the second
    to "<out_template>.2.fastq", and so on ("<...>.fastq.gz" via gzip when
    `gz` is true).  Output file 1 is always created; further files are
    opened the first time they are needed.

    All output files and the input reader are closed even if reading or
    writing fails part-way through.
    '''

    def _open_out(idx):
        # Open the idx-th (1-based) output file, gzip-compressed if requested.
        if gz:
            return gzip.open('%s.%s.fastq.gz' % (out_template, idx), 'w')
        return open('%s.%s.fastq' % (out_template, idx), 'w')

    outs = []
    fq = None
    try:
        outs.append(_open_out(1))
        fq = FASTQ(combined_fname)

        outidx = 1
        last_read = None
        for read in fq.fetch():
            if last_read and last_read.name == read.name:
                # Same name as the previous read: next file in the group.
                outidx += 1
                if len(outs) < outidx:
                    outs.append(_open_out(outidx))
            else:
                # New name: start over at the first output file.
                outidx = 1

            read.write(outs[outidx - 1])
            last_read = read
    finally:
        if fq is not None:
            fq.close()
        for out in outs:
            out.close()
def testFilterQualIllumina(self):
    # Run QualFilter(7, 5, illumina=True) over two reads; the expected
    # output has @foo truncated to 12 bases and tagged "#qual", while @bar
    # passes through unchanged.
    src = StringIO.StringIO('''\
@foo
ACGTACGTACGTACGT
+
JJJJJJJJJEEEEJJJ
@bar
ACGTACGTA
+
JJJJJEEJJ
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.QualFilter(pipeline, 7, 5, illumina=True, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo #qual
ACGTACGTACGT
+
JJJJJJJJJEEE
@bar
ACGTACGTA
+
JJJJJEEJJ
'''
    self.assertEqual(sink.getvalue(), expected)
def testTrimMinOffset(self):
    # 5' linker trimming with min_len=20: the expected output leaves @foo
    # (linker found 5 bases in) untouched and strips the leading linker
    # from @bar.
    src = StringIO.StringIO('''\
@foo
aaaaaTTGCaaccggttccttggaa
+
;;;;;;;;;;;;;;;;;;;;;;;;;
@bar
TTGCtgtgatagctacgactaaaacc
+
;;;;;;;;;;;;;;;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    ngsutils.fastq.trim.fastq_trim(
        FASTQ(fileobj=src), linker_5='TTGC', min_len=20, quiet=True, out=sink)

    expected = '''\
@foo
aaaaaTTGCaaccggttccttggaa
+
;;;;;;;;;;;;;;;;;;;;;;;;;
@bar
tgtgatagctacgactaaaacc
+
;;;;;;;;;;;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testTrimPctIdentity0_8(self):
    # 5' linker trimming with pct_identity=0.75: both the inexact (TTGCC)
    # and the exact (TTGGC) linker prefix are expected to be removed.
    # NOTE(review): the method name says 0_8 but the call passes 0.75.
    src = StringIO.StringIO('''\
@foo
TTGCCaaccggttccttagaa
+
;;;;;;;;;;;;;;;;;;;;;
@bar
TTGGCtgtgatagctacgactaaaacc
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    ngsutils.fastq.trim.fastq_trim(
        FASTQ(fileobj=src), linker_5='TTGGC', min_len=10,
        pct_identity=0.75, quiet=True, out=sink)

    expected = '''\
@foo
aaccggttccttagaa
+
;;;;;;;;;;;;;;;;
@bar
tgtgatagctacgactaaaacc
+
;;;;;;;;;;;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testTrimCS(self):
    # Note: colorspace trimming should really only be applicable to the 3' end
    #
    # 3' linker trimming on colorspace reads: @foo ends with the linker and
    # is expected to be shortened; @bar does not and is expected unchanged.
    src = StringIO.StringIO('''\
@foo
T012301231231013101112231
+
;;;;;;;;;;;;;;;;;;;;;;;;
@bar
T012301231231013101112111
+
;;;;;;;;;;;;;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    ngsutils.fastq.trim.fastq_trim(
        FASTQ(fileobj=src), linker_3='112231', min_len=10, quiet=True, out=sink)

    expected = '''\
@foo
T012301231231013101
+
;;;;;;;;;;;;;;;;;;
@bar
T012301231231013101112111
+
;;;;;;;;;;;;;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testFilterTrimRepeat(self):
    '''properly trim repeats'''
    # TrimFilter with an all-A pattern: the A run at the end of @foo is
    # expected to be removed (tagged "#trim"); @bar is expected unchanged.
    src = StringIO.StringIO('''\
@foo
ACGTACGTAAAAAAAAAA
+
;;;;;;;;;;;;;;;;;;
@bar
CCGGATTAGGCCCAAA
+
;;;;;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.TrimFilter(pipeline, 'AAAAAAAAAAAAAAAAAA', 1.0, 4, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo #trim
ACGTACGT
+
;;;;;;;;
@bar
CCGGATTAGGCCCAAA
+
;;;;;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testFilterSuffixQual(self):
    # SuffixQualFilter('!'): trailing bases whose quality is '!' are
    # expected to be removed from both reads, each tagged "#suff".
    src = StringIO.StringIO('''\
@foo
ACGTACGTACGTATTT
+
;;;;;;;;;;;;!!!!
@bar
ACGTACGTA
+
;;;;;;;;!
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.SuffixQualFilter(pipeline, '!', verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo #suff
ACGTACGTACGT
+
;;;;;;;;;;;;
@bar #suff
ACGTACGT
+
;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def fastq_tile(fname, outbase, length, offset, gz=False, quiet=False):
    '''
    Write fixed-length tiles of every read to per-tile output files.

    For each read, windows of `length` bases are taken at positions
    0, offset, 2 * offset, ...  The k-th window of a read is written to the
    k-th output file, which is opened on first use through
    _open_file(outbase, k, gz, quiet).  Each tile carries the comment
    "#tile:<start>,<end>".  Once all reads are processed, every output is
    renamed from the `tmp` name to the `fn` name returned by _open_file.

    Raises ValueError if `length` or `offset` is less than 1 (a
    non-positive offset would otherwise never advance the window).
    '''
    if length < 1 or offset < 1:
        raise ValueError(
            'length and offset must both be >= 1 (got length=%s, offset=%s)'
            % (length, offset))

    fastq = FASTQ(fname)

    outs = []
    fnames = []

    try:
        for read in fastq.fetch(quiet=quiet):
            out_idx = 0
            pos = 0
            # NOTE(review): the strict '<' means a window ending exactly at
            # the end of the read is not emitted -- confirm this is intended.
            while pos + length < len(read.seq):
                if len(outs) <= out_idx:
                    fobj, tmp, fn = _open_file(outbase, out_idx, gz, quiet)
                    outs.append(fobj)
                    fnames.append((tmp, fn))

                tile = read.subseq(
                    pos, pos + length,
                    comment="#tile:%s,%s" % (pos, pos + length))
                tile.write(outs[out_idx])

                pos += offset
                out_idx += 1
    finally:
        # Release every handle even when reading or writing fails.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: move the finished files into place.
    for tmp_name, final_name in fnames:
        os.rename(tmp_name, final_name)
def testFilterDiscard(self):
    # SizeFilter(12) with a discard callback: the callback is expected to
    # be invoked exactly once, with the name of the 8-base read.
    src = StringIO.StringIO('''\
@foo
ACGTACGTACGTACGT
+
;;;;;;;;;;;;;;;;
@bar
ACGTACGT
+
;;;;;;;;
''')

    dropped = []

    def _discard(name):
        dropped.append(name)

    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.SizeFilter(pipeline, 12, verbose=False, discard=_discard)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    self.assertEqual(dropped, ['bar'])
def testFilterQual(self):
    # QualFilter(7, 5) with default quality encoding: @foo is expected to
    # be truncated to 12 bases with "#qual" appended after its existing
    # comment; @bar is expected unchanged.
    src = StringIO.StringIO('''\
@foo comment
ACGTACGTACGTACGT
+
+++++++++&&&&+++
@bar
ACGTACGTA
+
+++++&&++
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.QualFilter(pipeline, 7, 5, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo comment #qual
ACGTACGTACGT
+
+++++++++&&&
@bar
ACGTACGTA
+
+++++&&++
'''
    self.assertEqual(sink.getvalue(), expected)
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None, quiet=False):
    '''
    Distribute the reads of a FASTQ file round-robin over `chunks` files.

    Output files are named "<outbase>.<N>.fastq" (or ".fastq.gz" when `gz`
    is true) for N in 1..chunks.  Each is written under a ".tmp."-prefixed
    name in the same directory and renamed into place only after all reads
    have been written.

    Unless `ignore_pairs` is set, the input's `is_paired` attribute decides
    whether consecutive reads sharing a name are kept in the same chunk.

    `count_fname` is accepted for interface compatibility; this function
    does not use it.

    Raises ValueError if `chunks` is less than 1.
    '''
    if chunks < 1:
        raise ValueError('chunks must be >= 1 (got %s)' % (chunks,))

    fastq = FASTQ(fname)

    # Only consult fastq.is_paired when pairing is not being ignored.
    is_paired = False if ignore_pairs else fastq.is_paired

    opener = gzip.open if gz else open
    extension = 'fastq.gz' if gz else 'fastq'

    outs = []
    fnames = []

    try:
        for idx in range(chunks):
            fn = '%s.%s.%s' % (outbase, idx + 1, extension)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))

            if not quiet:
                sys.stderr.write('Output file: %s\n' % fn)

            outs.append(opener(tmp, 'w'))

        # Start past the last chunk so the first read wraps around to chunk 0.
        i = chunks
        last_name = None

        for read in fastq.fetch(quiet=quiet):
            # Advance to the next chunk on every read, or (paired input)
            # only when the read name changes.
            if not is_paired or read.name != last_name:
                i += 1

            if i >= len(outs):
                i = 0

            last_name = read.name
            read.write(outs[i])
    finally:
        # Release every handle even when reading or writing fails.
        for out in outs:
            out.close()
        fastq.close()

    # Only reached on success: move the finished files into place.
    for tmp_name, final_name in fnames:
        os.rename(tmp_name, final_name)
def testFQRead(self):
    # Convert the qualities of a single read with fastq_convertqual, then
    # re-parse the output and compare name, sequence and converted quality.
    src = StringIO.StringIO('''\
@foo
ACGTacgtACGT
+
CDEFGHIJKLMN
''')
    converted = StringIO.StringIO('')

    ngsutils.fastq.convertqual.fastq_convertqual(FASTQ(fileobj=src), out=converted, quiet=True)

    # Rewind so the converted output can be read back from the start.
    converted.seek(0)
    reparsed = FASTQ(fileobj=converted)
    first = next(reparsed.fetch())

    self.assertEqual(first.name, 'foo')
    self.assertEqual(first.seq, 'ACGTacgtACGT')
    self.assertEqual(first.qual, "$%&'()*+,-./")
def testMerge(self):
    # Merge two inputs with matching read names; the expected output
    # interleaves them read by read (first input, then second input).
    first = StringIO.StringIO('''\
@foo
ACGTACGT
+
;;;;;;;;
@bar comment
ACGTACGT
+
;;;;;;;;
''')
    second = StringIO.StringIO('''\
@foo
acgtacgt
+
AAAAAAAA
@bar comment
acgtacgt
+
AAAAAAAA
''')
    merged = StringIO.StringIO('')

    readers = [FASTQ(fileobj=first), FASTQ(fileobj=second)]
    ngsutils.fastq.merge.fastq_merge(readers, out=merged, quiet=True)

    expected = '''\
@foo
ACGTACGT
+
;;;;;;;;
@foo
acgtacgt
+
AAAAAAAA
@bar comment
ACGTACGT
+
;;;;;;;;
@bar comment
acgtacgt
+
AAAAAAAA
'''
    self.assertEqual(merged.getvalue(), expected)
def test_splitFastq(self):
    # Split test_barcodes.fastq by barcode (reverse complements allowed)
    # and check the reads, sequences and qualities written for each tag,
    # then remove the generated files.
    here = os.path.dirname(__file__)
    reads = FASTQ(os.path.join(here, 'test_barcodes.fastq'))
    out_templ = os.path.join(here, 'out.%s.fastq')

    ngsutils.fastq.barcode_split.fastx_barcode_split(
        reads, out_templ, barcodes2, allow_revcomp=True)

    # tag -> (read names, sequences, qualities), each whitespace-separated
    expected = {
        'missing': ('quux', '', ''),
        'tag1': ('foo foo-rc',
                 'atcgatcgatcgatcg atcgatcgatcgatcg',
                 'AAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAA'),
        'tag2': ('bar', 'gctagctagctagcta', 'AAAAAAAAAAAAAAAA'),
        'tag3': ('baz', 'acgtacgtacgtacgt', 'AAAAAAAAAAAAAAAA'),
    }
    self.assert_fastq_contains(os.path.join(here, 'out.%s.fastq'), expected)

    self._unlink_fastx(os.path.join(here, 'out.%s.fastq'), 'missing tag1 tag2 tag3'.split())
def testAssert(self):
    # Merging inputs whose read names differ (foo vs. bar) is expected to
    # raise ValueError.
    first = StringIO.StringIO('''\
@foo
ACGTACGT
+
;;;;;;;;
''')
    second = StringIO.StringIO('''\
@bar
acgtacgt
+
AAAAAAAA
''')
    merged = StringIO.StringIO('')

    readers = [FASTQ(fileobj=first), FASTQ(fileobj=second)]
    self.assertRaises(
        ValueError,
        ngsutils.fastq.merge.fastq_merge,
        readers,
        out=merged,
        quiet=True)
def testFQRead(self):
    # Run fastq_csencode over one colorspace read and compare the complete
    # output text (letter-encoded sequence, one quality value fewer).
    src = StringIO.StringIO('''\
@foo
T0000111122223333....
+
....................
''')
    encoded = StringIO.StringIO('')

    ngsutils.fastq.csencode.fastq_csencode(FASTQ(fileobj=src), out=encoded, quiet=True)

    expected = '''@foo
AAACCCCGGGGTTTTNNNN
+
...................
'''
    self.assertEqual(expected, encoded.getvalue())
def testFilterPaired(self):
    # PairedFilter: @foo and @baz each appear twice in a row and are
    # expected to be kept; @bar appears once and is expected to be dropped.
    src = StringIO.StringIO('''\
@foo
ACGTACGTACGTACGT
+
;;;;;;;;;;;;;;;;
@foo
ACGTACGTACGTACGT
+
;;;;;;;;;;;;;;;;
@bar
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGT
+
;;;;;;;;
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.PairedFilter(pipeline, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo
ACGTACGTACGTACGT
+
;;;;;;;;;;;;;;;;
@foo
ACGTACGTACGTACGT
+
;;;;;;;;;;;;;;;;
@baz
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGT
+
;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testNames(self):
    # export_names is expected to write one read name per line, without
    # the header comment.
    src = StringIO.StringIO('''\
@foo
ACGTACGT
+
;;;;;;;;
@bar comment
ACGTACGT
+
;;;;;;;;
''')
    sink = StringIO.StringIO('')

    ngsutils.fastq.names.export_names(FASTQ(fileobj=src), out=sink, quiet=True)

    listing = sink.getvalue()
    self.assertEqual(listing, 'foo\nbar\n')
def testFilterTrimCS(self):
    # TrimFilter('1122', 0.8, 3) on colorspace reads: @foo and @bar are
    # expected to be trimmed (tagged "#trim"), @baz is expected unchanged,
    # and @quux is expected to be absent from the output.
    src = StringIO.StringIO('''\
@foo
T0123012301231122
+
;;;;;;;;;;;;;;;;
@bar
T012301231122
+
;;;;;;;;;;;;
@baz
T012301231102
+
;;;;;;;;;;;;
@quux
T1122Atcgtagt
+
;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.TrimFilter(pipeline, '1122', 0.8, 3, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo #trim
T012301230123
+
;;;;;;;;;;;;
@bar #trim
T01230123
+
;;;;;;;;
@baz
T012301231102
+
;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testFilterTrim(self):
    # TrimFilter('ATTT', 1.0, 3): @foo and @bar are expected to be trimmed
    # (tagged "#trim"), @baz is expected unchanged, and @quux is expected
    # to be absent from the output.
    src = StringIO.StringIO('''\
@foo
ACGTACGTACGTATTT
+
;;;;;;;;;;;;;;;;
@bar
ACGTatttACGT
+
;;;;;;;;;;;;
@baz
ACGTACGTATTC
+
;;;;;;;;;;;;
@quux
ATTTAtcgtagt
+
;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.TrimFilter(pipeline, 'ATTT', 1.0, 3, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo #trim
ACGTACGTACGT
+
;;;;;;;;;;;;
@bar #trim
ACGT
+
;;;;
@baz
ACGTACGTATTC
+
;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def assert_fastq_contains(self, base, args):
    '''
    Assert that each per-tag FASTQ file holds exactly the expected reads.

    base -- filename template containing one "%s", filled in with the tag
    args -- dict mapping tag -> (names, seqs, quals); each element is a
            whitespace-separated string.  When `seqs` is empty, only the
            read names and the read count are checked.

    Fails if a file contains a read whose name is not listed, if a listed
    read's (seq, qual) differs, or if the number of reads differs from the
    number of listed names.  The reader is closed before the count is
    checked, even when an assertion fails.
    '''
    for tag in args:
        spec = args[tag]
        valid = spec[0].split()

        seq_qual = {}
        if spec[1]:
            # name -> (seq, qual), paired up positionally
            seq_qual = dict(zip(valid, zip(spec[1].split(), spec[2].split())))

        fq = FASTQ(base % tag)
        count = 0
        try:
            for read in fq.fetch():
                if read.name not in valid:
                    self.fail('extra read in %s: %s' % (tag, read.name))

                count += 1
                if seq_qual:
                    self.assertEqual(seq_qual[read.name], (read.seq, read.qual))
        finally:
            fq.close()

        self.assertEqual(count, len(valid))
def testFilterTrim2(self):
    '''Test to make sure that we don't trim out the middle of a read'''
    # TrimFilter('ATTATT', 1.0, 4): @foo and @baz end with the pattern and
    # are expected to be trimmed; @bar only contains part of it mid-read
    # and is expected unchanged.
    src = StringIO.StringIO('''\
@foo
ACGTACGTACGTATTATT
+
;;;;;;;;;;;;;;;;;;
@bar
CCGGATTAGGCCGGCC
+
;;;;;;;;;;;;;;;;
@baz
CCGGATTATTATTATT
+
;;;;;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.TrimFilter(pipeline, 'ATTATT', 1.0, 4, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@foo #trim
ACGTACGTACGT
+
;;;;;;;;;;;;
@bar
CCGGATTAGGCCGGCC
+
;;;;;;;;;;;;;;;;
@baz #trim
CCGGATTATT
+
;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testSimple(self):
    # Compute fastq_stats over four reads and check the read count, the
    # length histogram, the per-position totals and quality sums, and the
    # derived length / quality summary statistics.
    src = StringIO.StringIO('''\
@foo
ACGTACGT
+
;;;;;;;;
@bar
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGTAC
+
;;;;;;;;;;
@quux
ACGTACGTAC
+
;;;;;;;;;;
''')

    # two reads length 8, two reads length 10
    stats = ngsutils.fastq.stats.fastq_stats(FASTQ(fileobj=src), quiet=True)

    self.assertEqual(stats.total_reads, 4)
    self.assertEqual(stats.lengths, [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2])
    self.assertEqual(stats.totals, [0, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2])

    # accumulator
    expected_quals = [0, 104, 104, 104, 104, 104, 104, 104, 104, 52, 52]
    self.assertEqual(stats.qualities, expected_quals)

    # Index 0 is skipped: its total is 0, so no mean can be computed.
    per_position = list(zip(stats.qualities, stats.totals))
    for qual_sum, n_reads in per_position[1:]:
        self.assertEqual(qual_sum / n_reads, 26)

    self.assertEqual(stats.length_stats.mean, 9)
    self.assertEqual(stats.length_stats.min_val, 8)
    self.assertEqual(stats.length_stats.max_val, 10)

    self.assertEqual(len(stats.quality_stats), 11)
    for qvstats in stats.quality_stats:
        if qvstats:
            self.assertEqual(qvstats.mean, 26)
def testTruncateCS(self):
    # Truncate colorspace reads to 4: the expected output keeps the
    # leading base plus four colors, four quality values, and the header
    # comment.
    src = StringIO.StringIO('''\
@foo
T01230123
+
;;;;;;;;
@bar comment
T01230123
+
;;;;;;;;
''')
    sink = StringIO.StringIO('')

    ngsutils.fastq.truncate.fastq_truncate(FASTQ(fileobj=src), 4, quiet=True, out=sink)

    expected = '''\
@foo
T0123
+
;;;;
@bar comment
T0123
+
;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testTrim53(self):
    # Trim a 5' and a 3' linker in one pass: @foo and @bar (linkers offset
    # from the read ends) are expected to be reduced to the insert; @baz
    # contains neither linker and is expected unchanged.
    src = StringIO.StringIO('''\
@foo
TTGCaaccggttccttggaaACGT
+
;;;;;;;;;;;;;;;;;;;;;;;;
@bar - note offset
aaTTGCtgtgatagctacgactACGTa
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;
@baz
TAGCtgtagatgatagatagaAGCT
+
;;;;;;;;;;;;;;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    ngsutils.fastq.trim.fastq_trim(
        FASTQ(fileobj=src), linker_5='TTGC', linker_3='ACGT',
        min_len=10, quiet=True, out=sink)

    expected = '''\
@foo
aaccggttccttggaa
+
;;;;;;;;;;;;;;;;
@bar - note offset
tgtgatagctacgact
+
;;;;;;;;;;;;;;;;
@baz
TAGCtgtagatgatagatagaAGCT
+
;;;;;;;;;;;;;;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)
def testFilterWildcard(self):
    # WildcardFilter(1): reads with two wildcard calls (@foo with "NN",
    # @quux with "..") are expected to be dropped; reads with none (@bar)
    # or one (@baz) are expected to be kept.
    src = StringIO.StringIO('''\
@foo
ACGTACNNACGTATTT
+
;;;;;;;;;;;;;;;;
@bar
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGNACTG
+
;;;;;;;;;;;;
@quux
ACG..CGAACTG
+
;;;;;;;;;;;;
''')
    sink = StringIO.StringIO('')

    pipeline = ngsutils.fastq.filter.FASTQReader(FASTQ(fileobj=src), verbose=False)
    pipeline = ngsutils.fastq.filter.WildcardFilter(pipeline, 1, verbose=False)
    ngsutils.fastq.filter.fastq_filter(pipeline, out=sink, quiet=True)

    expected = '''\
@bar
ACGTACGT
+
;;;;;;;;
@baz
ACGTACGNACTG
+
;;;;;;;;;;;;
'''
    self.assertEqual(sink.getvalue(), expected)