def test_check_is_pair_4b():
    """A FakeFastaRead followed by a FakeFQRead must make check_is_pair raise ValueError."""
    fasta_read = FakeFastaRead(name="seq/1", sequence="AAA")
    fastq_read = FakeFQRead(name="seq/2", quality="###", sequence="AAA")

    try:
        check_is_pair(fasta_read, fastq_read)
    except ValueError:
        pass
    else:
        # check_is_pair should fail here.
        assert False
def test_check_is_pair_4():
    """A FakeFQRead followed by a FakeFastaRead must make check_is_pair raise ValueError."""
    fastq_read = FakeFQRead(name='seq/1', quality='###', sequence='AAA')
    fasta_read = FakeFastaRead(name='seq/2', sequence='AAA')

    rejected = False
    try:
        check_is_pair(fastq_read, fasta_read)
    except ValueError:
        rejected = True

    # check_is_pair should fail here.
    assert rejected
def test_check_is_pair_4b():
    """A record without quality followed by one with quality must raise ValueError."""
    without_quality = screed.Record(name='seq/1', sequence='AAA')
    with_quality = screed.Record(name='seq/2', quality='###', sequence='AAA')

    try:
        check_is_pair(without_quality, with_quality)
    except ValueError:
        return

    # check_is_pair should fail here.
    assert False
def test_check_is_pair_4():
    """A record with quality followed by one without quality must raise ValueError."""
    with_quality = screed.Record(name='seq/1', quality='###', sequence='AAA')
    without_quality = screed.Record(name='seq/2', sequence='AAA')

    try:
        check_is_pair(with_quality, without_quality)
    except ValueError:
        pass
    else:
        # check_is_pair should fail here.
        assert False
def test_check_is_pair_4b():
    """Mixing a FakeFastaRead (first) with a FakeFQRead (second) must raise ValueError."""
    first = FakeFastaRead(name='seq/1', sequence='AAA')
    second = FakeFQRead(name='seq/2', quality='###', sequence='AAA')

    got_value_error = False
    try:
        check_is_pair(first, second)
    except ValueError:
        got_value_error = True

    # check_is_pair should fail here.
    assert got_value_error
def push_sequences(self, inputiter):
    """Queue the records of *inputiter* on ``self.inqueue`` in batches.

    Records are accumulated until ``self.group_size`` of them have been
    seen since the last submission.  The next record is then tested
    against its predecessor with ``check_is_pair``: a mate is added to
    the batch being submitted, anything else starts the following batch.
    Every batch is wrapped as ``SequenceGroup(0, batch)``.
    """
    pending = []
    previous = None
    seen = 0

    for record in inputiter:
        if seen < self.group_size:
            pending.append(record)
        else:
            # keep pairs together in batches, to retain the interleaving.
            if check_is_pair(previous, record):
                pending.append(record)
                carry_over = []
            else:
                carry_over = [record]

            self.inqueue.put(SequenceGroup(0, pending))
            pending = carry_over
            seen = 0

        previous = record
        seen += 1

    # submit last set of sequences
    if pending:
        self.inqueue.put(SequenceGroup(0, pending))
def push_sequences(self, inputiter):
    """Split *inputiter* into SequenceGroup batches and put them on ``self.inqueue``.

    A batch is submitted once ``self.group_size`` records have been
    counted.  When the record that triggers the submission pairs with the
    previous record (per ``check_is_pair``) it goes out with that batch;
    otherwise it becomes the first record of the next batch.
    """
    def submit(records):
        # Every batch is queued with a leading 0, as SequenceGroup(0, records).
        self.inqueue.put(SequenceGroup(0, records))

    current = []
    prev_record = None
    n_in_group = 0

    for record in inputiter:
        if n_in_group >= self.group_size:
            # keep pairs together in batches, to retain the interleaving.
            if check_is_pair(prev_record, record):
                current.append(record)
                submit(current)
                current = []
            else:
                submit(current)
                current = [record]
            n_in_group = 0
        else:
            current.append(record)

        prev_record, n_in_group = record, n_in_group + 1

    # submit last set of sequences
    if current:
        submit(current)
def main():
    """Interleave the records of two read files into a single output.

    Reads ``args.left`` and ``args.right`` in lockstep and writes each
    pair of records with ``write_record_pair``.  Unless
    ``args.no_reformat`` is set, read names lacking a left/right marker
    get '/1' or '/2' appended and the resulting pair is validated with
    ``check_is_pair``.

    Exits with status 1 when the two inputs hold a different number of
    records or when a pair fails validation.  Progress and errors are
    reported on stderr.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    # zip_longest pads the shorter input with None, which is how a
    # record-count mismatch is detected below.
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main():
    """Merge the left and right read files record-by-record into one output.

    Each left/right record pair is written with ``write_record_pair``.
    When reformatting is enabled (``args.no_reformat`` is false), names
    are given a '/1' or '/2' suffix where ``check_is_left`` /
    ``check_is_right`` report it missing, and the pair is then checked
    with ``check_is_pair``.  A record-count mismatch or a failed pair
    check terminates the program with exit status 1.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    for path in (args.left, args.right):
        check_input_files(path, args.force)
    check_space([args.left, args.right], args.force)

    s1_file, s2_file = args.left, args.right
    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    left_reads = screed.open(s1_file)
    right_reads = screed.open(s2_file)

    counter = 0
    for read1, read2 in zip_longest(left_reads, right_reads):
        # A None on either side means one input ran out before the other.
        if None in (read1, read2):
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        if not args.no_reformat:
            name1, name2 = read1.name, read2.name
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'
            read1.name, read2.name = name1, name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def WithDiagnostics(ifile, batch_size, fp, paired, norm):
    """
    Generator to do boilerplate output of statistics while normalizing data.
    Also checks for properly paired data.

    ifile      -- name of the sequence file, opened with screed.open
    batch_size -- number of records requested per batch from batchwise
    fp         -- report file object, or a false value to skip reporting
    paired     -- when true, each batch is validated with check_is_pair
    norm       -- object carrying the running ``total`` and ``discarded``
                  counters; ``total`` is advanced here by batch_size for
                  every batch

    Yields each batch unchanged.  Raises IOError when ``paired`` is set
    and the two records of a batch do not pair.
    """
    reads = screed.open(ifile, parse_description=False)
    for index, batch in enumerate(batchwise(reads, batch_size)):
        norm.total += batch_size
        total = norm.total
        discarded = norm.discarded

        if index > 0 and index % 100000 == 0:
            print('... kept {kept} of {total} or {perc:2}%'
                  .format(kept=total - discarded, total=total,
                          perc=int(100. - discarded / float(total) * 100.)),
                  file=sys.stderr)
            print('... in file', ifile, file=sys.stderr)

            if fp:
                # Space-separated columns: total, kept, kept fraction.
                print(total, total - discarded,
                      1. - (discarded / float(total)), file=fp)
                fp.flush()

        # If in paired mode, check that the reads are properly interleaved
        if paired:
            if not check_is_pair(batch[0], batch[1]):
                # Adjacent literals, not a backslash continuation, so no
                # source indentation ends up inside the message.
                raise IOError('Error: Improperly interleaved pairs '
                              '{b0} {b1}'.format(b0=batch[0].name,
                                                 b1=batch[1].name))

        yield batch
def test_check_is_pair_3_fa():
    """FASTA reads named 'seq 1::' and 'seq 2::' are recognised as a pair."""
    left = FakeFastaRead(name="seq 1::", sequence="AAA")
    right = FakeFastaRead(name="seq 2::", sequence="AAA")

    paired = check_is_pair(left, right)
    assert paired
def test_check_is_pair_3_broken_fq_2():
    """A 'seq 1::' record followed by a bare 'seq' record is not a pair."""
    tagged = screed.Record(name='seq 1::', quality='###', sequence='AAA')
    untagged = screed.Record(name='seq', quality='###', sequence='AAA')

    paired = check_is_pair(tagged, untagged)
    assert not paired
def test_check_is_pair_3_fa():
    """Quality-less records named 'seq 1::' and 'seq 2::' form a pair."""
    reads = (
        screed.Record(name='seq 1::', sequence='AAA'),
        screed.Record(name='seq 2::', sequence='AAA'),
    )
    assert check_is_pair(*reads)
def test_check_is_pair_7():
    """Reads given in /2-then-/1 order are not accepted as a pair."""
    second_mate = FakeFastaRead(name="seq/2", sequence="AAA")
    first_mate = FakeFastaRead(name="seq/1", sequence="AAA")

    paired = check_is_pair(second_mate, first_mate)
    assert not paired
def test_check_is_pair_2():
    """Records with quality named 'seq/1' and 'seq/2' are a valid pair."""
    left = screed.Record(name='seq/1', quality='###', sequence='AAA')
    right = screed.Record(name='seq/2', quality='###', sequence='AAA')

    paired = check_is_pair(left, right)
    assert paired
def test_check_is_pair_2():
    """FASTQ reads named 'seq/1' and 'seq/2' are recognised as a pair."""
    mates = (
        FakeFQRead(name='seq/1', quality='###', sequence='AAA'),
        FakeFQRead(name='seq/2', quality='###', sequence='AAA'),
    )
    assert check_is_pair(*mates)
def test_check_is_pair_3_broken_fq_1():
    """A bare 'seq' record followed by a 'seq 2::' record is not a pair."""
    untagged = screed.Record(name='seq', quality='###', sequence='AAA')
    tagged = screed.Record(name='seq 2::', quality='###', sequence='AAA')

    paired = check_is_pair(untagged, tagged)
    assert not paired
def test_check_is_pair_3_broken_fq_2():
    """FASTQ read 'seq 1::' followed by a read named just 'seq' is rejected."""
    reads = (
        FakeFQRead(name="seq 1::", quality="###", sequence="AAA"),
        FakeFQRead(name="seq", quality="###", sequence="AAA"),
    )
    assert not check_is_pair(*reads)
def test_check_is_pair_7():
    """A 'seq/2' read followed by a 'seq/1' read must not count as a pair."""
    swapped = (
        FakeFastaRead(name='seq/2', sequence='AAA'),
        FakeFastaRead(name='seq/1', sequence='AAA'),
    )
    assert not check_is_pair(*swapped)
def test_check_is_pair_3_fa():
    """check_is_pair accepts FASTA reads named 'seq 1::' and 'seq 2::'."""
    first = FakeFastaRead(name='seq 1::', sequence='AAA')
    second = FakeFastaRead(name='seq 2::', sequence='AAA')

    result = check_is_pair(first, second)
    assert result
def test_check_is_pair_3_broken_fq_1():
    """FASTQ read named just 'seq' followed by 'seq 2::' is rejected."""
    reads = (
        FakeFQRead(name='seq', quality='###', sequence='AAA'),
        FakeFQRead(name='seq 2::', quality='###', sequence='AAA'),
    )
    assert not check_is_pair(*reads)
def test_check_is_pair_7():
    """Records ordered 'seq/2' then 'seq/1' are not reported as a pair."""
    second_mate = screed.Record(name='seq/2', sequence='AAA')
    first_mate = screed.Record(name='seq/1', sequence='AAA')

    result = check_is_pair(second_mate, first_mate)
    assert not result
def main():
    """Interleave an R1/R2 pair of read files into ``args.output``.

    With two input files they are used as given; with one, the R2 name
    is guessed by replacing '_R1_' with '_R2_'.  Missing inputs abort
    with exit status 1 unless ``args.force`` is set.  Names lacking a
    left/right marker get '/1' or '/2' appended, every pair is checked
    with ``check_is_pair``, and a record-count mismatch or failed pair
    check exits with status 1.
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for filename in args.infiles:
        check_file_status(filename, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) != 2:
        # Only R1 was supplied: derive the R2 name from it.
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s1_file == s2_file:
            print >>sys.stderr, ("ERROR: given only one filename, that "
                                 "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)
    else:
        s2_file = args.infiles[1]

    missing_input = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        missing_input = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        missing_input = True

    if missing_input and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    left_reads = screed.open(s1_file, parse_description=False)
    right_reads = screed.open(s2_file, parse_description=False)

    counter = 0
    for read1, read2 in itertools.izip_longest(left_reads, right_reads):
        # izip_longest pads the shorter input with None.
        if read1 is None or read2 is None:
            print >>sys.stderr, ("ERROR: Input files contain different number"
                                 " of records.")
            sys.exit(1)

        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1, name2 = read1.name, read2.name
        if not check_is_left(name1):
            name1 += '/1'
        if not check_is_right(name2):
            name2 += '/2'
        read1.name, read2.name = name1, name2

        if not check_is_pair(read1, read2):
            print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
                "%s %s" % (read1.name, read2.name)
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output.name
def test_check_is_pair_3_broken_fq_2():
    """check_is_pair rejects 'seq 1::' followed by a read named only 'seq'."""
    tagged = FakeFQRead(name='seq 1::', quality='###', sequence='AAA')
    untagged = FakeFQRead(name='seq', quality='###', sequence='AAA')

    result = check_is_pair(tagged, untagged)
    assert not result
def normalize_by_median(input_filename, outfp, htable, args, report_fp=None):
    """Write the reads of *input_filename* that pass the median-count filter.

    Records are read in batches of one, or two when ``args.paired`` is
    set.  A batch is written to *outfp* when every record is at least
    ``htable.ksize()`` long and at least one record has a median count
    below ``args.cutoff``; such records are also consumed into *htable*.
    All other batches are counted as discarded.

    When *report_fp* is given, lines of ``total kept kept-fraction`` are
    written to it periodically and once at the end.

    Returns the tuple ``(total, discarded)``.
    Raises IOError in paired mode when a batch fails ``check_is_pair``.
    """
    desired_coverage = args.cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 2 if args.paired else 1

    total = 0
    discarded = 0
    for index, batch in enumerate(
            batchwise(screed.open(input_filename, parse_description=False),
                      batch_size)):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '... kept {kept} of {total} or'\
                ' {perc:2}%'.format(kept=total - discarded, total=total,
                                    perc=int(100. - discarded /
                                             float(total) * 100.))
            print >> sys.stderr, '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        if args.paired:
            if not check_is_pair(batch[0], batch[1]):
                # Adjacent literals, not a backslash continuation, so no
                # source indentation ends up inside the message.
                raise IOError('Error: Improperly interleaved pairs '
                              '{b0} {b1}'.format(b0=batch[0].name,
                                                 b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are longer than K
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                write_record(record, outfp)
        else:
            discarded += batch_size

    if report_fp:
        # With no reads processed the kept fraction is undefined; skip
        # the summary line rather than divide by zero.
        if total:
            print >> report_fp, total, total - discarded, \
                1. - (discarded / float(total))
        report_fp.flush()

    return total, discarded
def main():
    """Combine R1 and R2 read files into one interleaved output.

    The first input file is R1.  R2 is the second input file when two
    are given, otherwise it is guessed by swapping '_R1_' for '_R2_' in
    the R1 name.  A missing input exits with status 1 unless
    ``args.force`` is set.  Each pair has '/1' and '/2' appended where
    ``check_is_left`` / ``check_is_right`` say they are absent, is
    verified with ``check_is_pair`` and is written to ``args.output``.
    Unequal record counts or a failed pair check exit with status 1.
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for infile in args.infiles:
        check_file_status(infile, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    have_both = len(args.infiles) == 2
    if have_both:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s1_file == s2_file:
            # The replacement changed nothing, so there is no R2 name to guess.
            print >> sys.stderr, ("ERROR: given only one filename, that "
                                  "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    r1_missing = not os.path.exists(s1_file)
    if r1_missing:
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file

    r2_missing = not os.path.exists(s2_file)
    if r2_missing:
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file

    if (r1_missing or r2_missing) and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    r1_records = screed.open(s1_file, parse_description=False)
    r2_records = screed.open(s2_file, parse_description=False)
    for read1, read2 in itertools.izip_longest(r1_records, r2_records):
        if read1 is None or read2 is None:
            print >> sys.stderr, ("ERROR: Input files contain different number"
                                  " of records.")
            sys.exit(1)

        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        left_name = read1.name
        if not check_is_left(left_name):
            left_name += '/1'

        right_name = read2.name
        if not check_is_right(right_name):
            right_name += '/2'

        read1.name = left_name
        read2.name = right_name

        if not check_is_pair(read1, read2):
            print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
                "%s %s" % (read1.name, read2.name)
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output.name
def normalize_by_median(input_filename, outfp, htable, paired, cutoff,
                        report_fp=None):
    """Write the reads of *input_filename* that pass the median-count filter.

    Records are read in batches of one, or two when *paired* is true.  A
    batch is written to *outfp* when every record is at least
    ``htable.ksize()`` long and at least one record has a median count
    below *cutoff*; such records are also consumed into *htable*.  All
    other batches are counted as discarded.

    When *report_fp* is given, lines of ``total kept kept-fraction`` are
    written to it periodically and once at the end.

    Returns the tuple ``(total, discarded)``.
    Raises IOError in paired mode when a batch fails ``check_is_pair``.
    """
    desired_coverage = cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 2 if paired else 1

    total = 0
    discarded = 0
    for index, batch in enumerate(batchwise(screed.open(
            input_filename, parse_description=False), batch_size)):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '... kept {kept} of {total} or'\
                ' {perc:2}%'.format(kept=total - discarded, total=total,
                                    perc=int(100. - discarded /
                                             float(total) * 100.))
            print >>sys.stderr, '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        if paired:
            if not check_is_pair(batch[0], batch[1]):
                # Adjacent literals, not a backslash continuation, so no
                # source indentation ends up inside the message.
                raise IOError('Error: Improperly interleaved pairs '
                              '{b0} {b1}'.format(b0=batch[0].name,
                                                 b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are longer than K
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                write_record(record, outfp)
        else:
            discarded += batch_size

    if report_fp:
        # With no reads processed the kept fraction is undefined; skip
        # the summary line rather than divide by zero.
        if total:
            print >> report_fp, total, total - discarded, \
                1. - (discarded / float(total))
        report_fp.flush()

    return total, discarded
def test_check_is_pair_2():
    """check_is_pair accepts FASTQ reads named 'seq/1' and 'seq/2'."""
    first_mate = FakeFQRead(name="seq/1", quality="###", sequence="AAA")
    second_mate = FakeFQRead(name="seq/2", quality="###", sequence="AAA")

    result = check_is_pair(first_mate, second_mate)
    assert result