def test_check_is_right():
    """Exercise check_is_right() on a handful of read names.

    Names ending in '/2' or carrying a ' 2::N' suffix must be accepted;
    the '/1' and ' 1::N' forms, a bare name, and 'seq 2' must be rejected.
    """
    # (read name, whether check_is_right() should give a truthy answer)
    cases = [
        ('seq1/1', False),
        ('seq1 1::N', False),
        ('seq1/2', True),
        ('seq1 2::N', True),
        ('seq', False),
        ('seq 2', False),
    ]
    for read_name, expected in cases:
        assert bool(check_is_right(read_name)) == expected
def main():
    """Interleave the left and right read files into one paired output.

    Parses the command line, validates both inputs and the available
    space, then reads the two files in lockstep and hands every pair of
    records to ``write_record_pair``.  Progress and errors are reported on
    stderr.

    Unless ``args.no_reformat`` is set, names that check_is_left() /
    check_is_right() do not recognise get '/1' / '/2' appended, and each
    renamed pair must pass check_is_pair().

    Exits with status 1 if the inputs contain different numbers of records
    or if a pair does not look like paired data.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    left_path = args.left
    right_path = args.right

    for path in (left_path, right_path):
        check_input_files(path, args.force)
    check_space([left_path, right_path], args.force)

    print("Interleaving:\n\t%s\n\t%s" % (left_path, right_path),
          file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    left_reads = screed.open(left_path)
    right_reads = screed.open(right_path)

    counter = 0
    # zip_longest pads the shorter input with None, which is how a
    # mismatch in record counts is detected.
    for read1, read2 in zip_longest(left_reads, right_reads):
        if any(read is None for read in (read1, read2)):
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if not counter % 100000:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        left_name = read1.name
        right_name = read2.name

        if not args.no_reformat:
            left_ok = check_is_left(left_name)
            if not left_ok:
                left_name = left_name + '/1'
            right_ok = check_is_right(right_name)
            if not right_ok:
                right_name = right_name + '/2'

            read1.name = left_name
            read2.name = right_name

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main():
    """Interleave two read files (left and right) into one output stream.

    Steps:
      1. parse arguments and check both input files and free space;
      2. open the output through ``get_file_writer``;
      3. iterate both inputs in lockstep, optionally normalising read
         names, and write each pair with ``write_record_pair``.

    Unless ``args.no_reformat`` is set, a left name not recognised by
    check_is_left() gets '/1' appended, a right name not recognised by
    check_is_right() gets '/2' appended, and the renamed records must pass
    check_is_pair().

    All progress and error messages go to stderr.  The process exits with
    status 1 when the inputs hold different numbers of records or when a
    pair fails the pairing check.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    # zip_longest yields None for the exhausted side, so a None on either
    # side means the two files are of unequal length.
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        # progress report; fires before the increment, so also at pair 0
        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def test_check_is_right():
    """Verify which read names check_is_right() treats as right-hand reads."""
    # Each entry pairs a read name with the truthiness expected back.
    expectations = (
        ("seq1/1", False),
        ("seq1 1::N", False),
        ("seq1/2", True),
        ("seq1 2::N", True),
        ("seq", False),
        ("seq 2", False),
    )
    for name, should_be_right in expectations:
        verdict = check_is_right(name)
        if should_be_right:
            assert verdict
        else:
            assert not verdict
def main():
    """Interleave an R1 and an R2 read file into ``args.output``.

    One or two input files are taken from ``args.infiles``.  With a single
    file, the R2 filename is guessed by replacing '_R1_' with '_R2_'; if
    that leaves the name unchanged the program exits with status 1.

    Reads are consumed in lockstep.  Names not recognised by
    check_is_left() / check_is_right() get '/1' / '/2' appended, and every
    pair must pass check_is_pair() before it is written with
    ``write_record_pair``.  Messages go to stderr; any error exits with
    status 1 (missing input files are tolerated when ``args.force`` is set).
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for filename in args.infiles:
        check_file_status(filename, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        # only one file given: derive the R2 name from the R1 name
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s1_file == s2_file:
            print >>sys.stderr, ("ERROR: given only one filename, that "
                                 "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    r1_missing = not os.path.exists(s1_file)
    if r1_missing:
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file

    r2_missing = not os.path.exists(s2_file)
    if r2_missing:
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file

    if (r1_missing or r2_missing) and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    left_reads = screed.open(s1_file, parse_description=False)
    right_reads = screed.open(s2_file, parse_description=False)

    counter = 0
    # izip_longest pads the shorter input with None
    for read1, read2 in itertools.izip_longest(left_reads, right_reads):
        if read1 is None or read2 is None:
            print >>sys.stderr, ("ERROR: Input files contain different number"
                                 " of records.")
            sys.exit(1)

        if not counter % 100000:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        left_name = read1.name
        if not check_is_left(left_name):
            left_name = left_name + '/1'

        right_name = read2.name
        if not check_is_right(right_name):
            right_name = right_name + '/2'

        read1.name = left_name
        read2.name = right_name

        if not check_is_pair(read1, read2):
            print >>sys.stderr, ("ERROR: This doesn't look like paired data! "
                                 "%s %s" % (read1.name, read2.name))
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output.name
def main():
    """Split an interleaved read file into a left file and a right file.

    Output names default to ``<basename of infile>.1`` and ``.2``, placed
    in ``args.output_directory`` when one is given (created if missing);
    ``args.output_first`` / ``args.output_second`` override either name.

    Records come from ``broken_paired_reader``.  A pair sends its first
    record to the left file and its second to the right file.  An unpaired
    record is routed by check_is_left() / check_is_right().  The program
    exits with status 1 if an unpaired record is seen while
    ``args.force_paired`` is set, or if an unpaired record's name matches
    neither check.

    Both output files are closed on every exit path, including sys.exit().
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # The with-block guarantees both outputs are flushed and closed even
    # when one of the sys.exit(1) paths below is taken.
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            # progress report, skipped for index 0
            if index % 100000 == 0 and index:
                print >> sys.stderr, '...', index

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
                    record1.name
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # orphan read: route it by the marker in its name
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    print >>sys.stderr, \
                        "Unrecognized format for read pair information: %s" \
                        % name
                    print >> sys.stderr, "Exiting."
                    sys.exit(1)

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (counter1 + counter2, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    """Split an interleaved read file into separate /1 and /2 files.

    The left and right output paths default to the input's basename with
    '.1' and '.2' appended, optionally inside ``args.output_directory``
    (which is created when it does not exist).  ``args.output_first`` and
    ``args.output_second`` replace the respective default path.

    Each item from ``broken_paired_reader`` is either a pair (first record
    to the left file, second to the right file) or a single record, which
    is routed by check_is_left() / check_is_right().  Exits with status 1
    when ``args.force_paired`` is set and an unpaired record appears, or
    when an unpaired record's name matches neither check.

    The output files are managed by a ``with`` statement so they are
    closed on normal completion and on the error exits alike.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # Opening both outputs in one with-statement closes them on every
    # path out of the loop, including SystemExit raised by sys.exit().
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            # report progress every 100000 records, but not at index 0
            if index % 100000 == 0 and index:
                print >> sys.stderr, '...', index

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
                    record1.name
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # single record: its name decides which file it goes to
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    print >>sys.stderr, \
                        "Unrecognized format for read pair information: %s" \
                        % name
                    print >>sys.stderr, "Exiting."
                    sys.exit(1)

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (counter1 + counter2, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    """Merge separate R1 and R2 read files into one interleaved output.

    ``args.infiles`` holds one or two filenames.  When only one is given,
    the second is derived by substituting '_R2_' for '_R1_'; a name without
    '_R1_' is an error.  Missing files abort the run unless ``args.force``
    is set.

    The two inputs are read side by side.  Read names that fail
    check_is_left() / check_is_right() are suffixed with '/1' / '/2', each
    pair is validated with check_is_pair(), and accepted pairs are written
    to ``args.output`` via ``write_record_pair``.  Diagnostics go to stderr
    and every error path exits with status 1.
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for input_name in args.infiles:
        check_file_status(input_name, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) != 2:
        # guess the partner file from the R1 filename
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s1_file == s2_file:
            print >> sys.stderr, ("ERROR: given only one filename, that "
                                  "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)
    else:
        s2_file = args.infiles[1]

    any_missing = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        any_missing = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        any_missing = True

    if any_missing and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    n_pairs = 0
    reads_r1 = screed.open(s1_file, parse_description=False)
    reads_r2 = screed.open(s2_file, parse_description=False)
    paired_stream = itertools.izip_longest(reads_r1, reads_r2)

    for read1, read2 in paired_stream:
        # a None on either side means one file ran out before the other
        if read1 is None or read2 is None:
            print >> sys.stderr, ("ERROR: Input files contain different number"
                                  " of records.")
            sys.exit(1)

        if n_pairs % 100000 == 0:
            print >> sys.stderr, '...', n_pairs, 'pairs'
        n_pairs += 1

        name1 = read1.name
        if not check_is_left(name1):
            name1 = name1 + '/1'

        name2 = read2.name
        if not check_is_right(name2):
            name2 = name2 + '/2'

        read1.name = name1
        read2.name = name2

        if not check_is_pair(read1, read2):
            print >>sys.stderr, ("ERROR: This doesn't look like paired data! "
                                 "%s %s" % (read1.name, read2.name))
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % n_pairs
    print >> sys.stderr, 'output written to', args.output.name
def main():
    """Split an interleaved read file into left (/1) and right (/2) outputs.

    Input may be a regular file or stdin ('/dev/stdin' or '-').  When
    reading from stdin both ``args.output_first`` and
    ``args.output_second`` must be supplied, otherwise the program exits
    with status 1.  For a regular file the outputs default to the input's
    basename plus '.1' / '.2', placed in ``args.output_directory`` when
    one is given (created if missing).

    ``args.output_first`` / ``args.output_second``, when given, are used
    directly as the output handles and their ``.name`` is reported.
    Default outputs are opened here and closed before returning.

    Records come from ``broken_paired_reader``: a pair is split across the
    two outputs, and an unpaired record is routed by check_is_left() /
    check_is_right().  Exits with status 1 if ``args.force_paired`` is set
    and an unpaired record appears, or if an unpaired record's name
    matches neither check.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        if not (args.output_first and args.output_second):
            # Must be a print() call: the rest of this function uses
            # print(..., file=...), so print is a function here and the
            # old ``print >> sys.stderr`` form would raise TypeError.
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    counter1 = 0
    counter2 = 0

    # Only handles opened by this function are closed by it; handles that
    # arrive through args.output_first / args.output_second are left alone.
    opened_here = []
    try:
        # OVERRIDE output file locations with -1, -2
        if args.output_first:
            fp_out1 = args.output_first
            out1 = fp_out1.name
        else:
            # Use default filename created above
            fp_out1 = open(out1, 'w')
            opened_here.append(fp_out1)

        if args.output_second:
            fp_out2 = args.output_second
            out2 = fp_out2.name
        else:
            # Use default filename created above
            fp_out2 = open(out2, 'w')
            opened_here.append(fp_out2)

        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        paired_iter = broken_paired_reader(screed_iter)
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print('ERROR, %s is not part of a pair' % record1.name,
                      file=sys.stderr)
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # unpaired record: the name decides which output gets it
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    print("Unrecognized format for read pair information: %s"
                          % name, file=sys.stderr)
                    print("Exiting.", file=sys.stderr)
                    sys.exit(1)
    finally:
        for handle in opened_here:
            handle.close()

    print("DONE; split %d sequences (%d left, %d right)" %
          (counter1 + counter2, counter1, counter2), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)