def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and writes every record
    twice in a fasta-like layout (a ``>identifier`` line followed by one
    data line): sequences go to the file named
    ``options.pattern % "csfasta"`` and quality strings to
    ``options.pattern % "qual"``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--pattern", dest="pattern", type="string",
        help="filename prefix [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    # try/finally guarantees both output files are closed even when
    # reading or converting the input fails half-way through.
    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    try:
        outfile_qual = IOTools.openFile(options.pattern % "qual", "w")
        try:
            if options.change_format:
                records = Fastq.iterate_convert(
                    options.stdin,
                    format=options.change_format,
                    guess=options.guess_format)
            else:
                records = Fastq.iterate(options.stdin)

            for record in records:
                c.input += 1
                outfile_seq.write(
                    ">%s\n%s\n" % (record.identifier, record.seq))
                outfile_qual.write(
                    ">%s\n%s\n" % (record.identifier, record.quals))
                c.output += 1
        finally:
            outfile_qual.close()
    finally:
        outfile_seq.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and applies the first
    operation whose option is set, tested in this order:
    --change-format, --sample, --apply, --trim3, --uniq, --sort,
    --renumber-ids. Output goes to ``options.stdout``; the paired
    variants of --sample and --sort also write the second read of each
    pair to the file given by --outfile-pair.

    Raises ValueError when --pair is combined with --sample or --sort
    but --outfile-pair is missing.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample", dest="sample", type="float",
        help="sample a proportion of reads [default=%default].")

    parser.add_option(
        "--pair", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--outfile-pair", dest="outfile_pair", type="string",
        help="if data is paired, filename for second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--uniq", dest="uniq", action="store_true",
        help="remove duplicate reads (by name) [default=%default].")

    parser.add_option(
        "--apply", dest="apply", type="string",
        help="apply a filter to fastq file (taking only reads in "
        "filename) [default=%default].")

    parser.add_option(
        "--trim3", dest="trim3", type="int",
        help="trim # bases from 3' end [default=%default].")

    parser.add_option(
        "--sort", dest="sort", action="store_true",
        help="sort fastq by sequence id [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--renumber-ids", dest="renumber_ids", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        sample=None,
        trim3=None,
        pair=None,
        apply=None,
        uniq=False,
        outfile_pair=None,
        sort=None,
        seed=None,
        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                # one random draw decides the fate of both mates so the
                # two output files stay in step
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                outfile2.close()
        else:
            # unpaired sampling only; without this ``else`` any records
            # left on stdin after the paired pass would be sampled and
            # written without their mates
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the part of the identifier before the first
            # space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1
        # Need to change this to incorporate both pairs

    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                # the last two characters of the identifier are dropped
                # and written back as "/1" and "/2" below
                entries1[record1.identifier[:-2]] = (
                    record1.seq, record1.quals)
                entries2[record2.identifier[:-2]] = (
                    record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1), \
                    "paired files do not contain the same reads " \
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes one tab-separated row per fastq record read from
    ``options.stdin``: the identifier, the number of quality values
    below ``options.min_quality``, the number of ``N``/``.`` characters
    in the sequence and the fields of ``Stats.Summary`` over the
    record's phred scores.
    """
    argv = argv or sys.argv

    quality_formats = ('sanger', 'solexa', 'phred64', 'illumina-1.8',
                       'integer')

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=quality_formats,
        help="The default behaviour of the script is to guess the "
        "quality format of the input fastq file. The user can specify "
        "the quality format of the input file using the --guess-format "
        "option. The script will use this format if the sequence "
        "qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=quality_formats,
        help="The script will convert quality scores to the "
        "destination format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    # convert qualities only when a target format was requested
    if options.target_format:
        records = Fastq.iterate_convert(options.stdin,
                                        format=options.target_format,
                                        guess=options.guess_format)
    else:
        records = Fastq.iterate_guess(options.stdin,
                                      guess=options.guess_format)

    summary_columns = "\t".join(Stats.Summary().getHeaders())
    options.stdout.write("read\tnfailed\tnN\t%s\n" % summary_columns)

    threshold = options.min_quality

    for record in records:
        c.input += 1
        phred = record.toPhred()
        below_threshold = sum(1 for score in phred if score < threshold)
        unknown_bases = record.seq.count("N") + record.seq.count(".")
        row = (record.identifier,
               below_threshold,
               unknown_bases,
               str(Stats.Summary(phred)))
        options.stdout.write("%s\t%i\t%i\t%s\n" % row)
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads all fastq records from ``options.stdin`` and writes a header
    line plus a single tab-separated summary row to ``options.stdout``:
    number of reads, number of bases, mean and median read length, mean
    and median of the per-read mean quality, and the number of bases
    whose phred score is below ``options.min_quality``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # Help texts are built from adjacent literals: a backslash
    # continuation inside a string literal would embed the source
    # indentation in the message shown to the user.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess "
        "the quality format of the input fastq file. The user "
        "can specify the quality format of the input file using "
        "the --guess-format option. The script will use this format if "
        "sequences qualities are ambiguous.[default=%default].")

    parser.add_option(
        "-f", "--target-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script guesses the quality format of the input "
        "file and converts quality scores to the destination "
        "format unless --guess-format is specified [default=%default].")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    # per-read values are kept because the medians need the full
    # distribution
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += sum(1 for x in quals if x < min_quality)
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\t"
        "mean_quality\tmedian_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads,
                                          number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and writes every record
    twice in a fasta-like layout (a ``>identifier`` line followed by one
    data line): sequences go to the file named
    ``options.pattern % "csfasta"`` and quality strings to
    ``options.pattern % "qual"``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    # "$Id$" carries its closing "$" like the keyword in the sibling
    # scripts.
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--target-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="set quality scores to format "
        "[default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="pattern", type="string",
        help="filename prefix [default=%default].")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    # try/finally guarantees both output files are closed even when
    # reading or converting the input fails half-way through.
    outfile_seq = IOTools.open_file(options.pattern % "csfasta", "w")
    try:
        outfile_qual = IOTools.open_file(options.pattern % "qual", "w")
        try:
            if options.change_format:
                records = Fastq.iterate_convert(
                    options.stdin,
                    format=options.change_format,
                    guess=options.guess_format)
            else:
                records = Fastq.iterate(options.stdin)

            for record in records:
                c.input += 1
                outfile_seq.write(
                    ">%s\n%s\n" % (record.identifier, record.seq))
                outfile_qual.write(
                    ">%s\n%s\n" % (record.identifier, record.quals))
                c.output += 1
        finally:
            outfile_qual.close()
    finally:
        outfile_seq.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and applies the
    operation selected with --method; nothing is processed when no
    method is given. Output goes to ``options.stdout``; the paired
    variants of ``sample`` and ``sort`` also write the second read of
    each pair to the file named by ``options.output_filename_pattern``.

    Raises ValueError when a paired ``sample``/``sort`` is requested
    without an output filename pattern, or when ``apply`` is requested
    without a file of identifiers.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("apply",
                 "change-format",
                 "renumber-reads",
                 "sample",
                 "sort",
                 "trim3",
                 "trim5",
                 "unique",
                 "grep"),
        help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # compiled once; matching is anchored at the start of the
        # sequence (re.match semantics)
        pattern = re.compile(options.grep_pattern)
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if pattern.match(record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern,
                                        "w")
            try:
                # one random draw decides the fate of both mates so the
                # two output files stay in step
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                outfile2.close()
        else:
            # unpaired sampling only; without this ``else`` any records
            # left on stdin after the paired pass would be sampled and
            # written without their mates
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        # ``options.apply`` has no command line option of its own, so
        # fall back to --map-tsv-file, the option documented for this
        # method.
        filename = options.apply or options.map_tsv_file
        if not filename:
            raise ValueError(
                "please specify a file with read identifiers "
                "(--map-tsv-file)")
        ids = set(IOTools.readList(IOTools.openFile(filename)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the part of the identifier before the first
            # space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1
        # Need to change this to incorporate both pairs

    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")

            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                # the last two characters of the identifier are dropped
                # and written back as "/1" and "/2" below
                entries1[record1.identifier[:-2]] = (
                    record1.seq, record1.quals)
                entries2[record2.identifier[:-2]] = (
                    record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern,
                                        "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1), \
                    "paired files do not contain the same reads " \
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.method == "renumber-reads":
        for id_count, record in enumerate(Fastq.iterate(options.stdin), 1):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and applies the first
    operation whose option is set, tested in this order:
    --change-format, --sample, --apply, --trim3, --uniq, --sort,
    --renumber-ids. Output goes to ``options.stdout``; the paired
    variants of --sample and --sort also write the second read of each
    pair to the file given by --outfile-pair.

    Raises ValueError when --pair is combined with --sample or --sort
    but --outfile-pair is missing.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample", dest="sample", type="float",
        help="sample a proportion of reads [default=%default].")

    parser.add_option(
        "--pair", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--outfile-pair", dest="outfile_pair", type="string",
        help="if data is paired, filename for second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--uniq", dest="uniq", action="store_true",
        help="remove duplicate reads (by name) [default=%default].")

    parser.add_option(
        "--apply", dest="apply", type="string",
        help="apply a filter to fastq file (taking only reads in "
        "filename) [default=%default].")

    parser.add_option(
        "--trim3", dest="trim3", type="int",
        help="trim # bases from 3' end [default=%default].")

    parser.add_option(
        "--sort", dest="sort", action="store_true",
        help="sort fastq by sequence id [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--renumber-ids", dest="renumber_ids", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        sample=None,
                        trim3=None,
                        pair=None,
                        apply=None,
                        uniq=False,
                        outfile_pair=None,
                        sort=None,
                        seed=None,
                        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                # one random draw decides the fate of both mates so the
                # two output files stay in step
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                outfile2.close()
        else:
            # unpaired sampling only; without this ``else`` any records
            # left on stdin after the paired pass would be sampled and
            # written without their mates
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the part of the identifier before the first
            # space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1
        # Need to change this to incorporate both pairs

    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                # the last two characters of the identifier are dropped
                # and written back as "/1" and "/2" below
                entries1[record1.identifier[:-2]] = (
                    record1.seq, record1.quals)
                entries2[record2.identifier[:-2]] = (
                    record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1), \
                    "paired files do not contain the same reads " \
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads all fastq records from ``options.stdin`` and writes a header
    line plus a single tab-separated summary row to ``options.stdout``:
    number of reads, number of bases, mean and median read length, mean
    and median of the per-read mean quality, and the number of bases
    whose phred score is below ``options.min_quality``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    # Help texts are built from adjacent literals: a backslash
    # continuation inside a string literal would embed the source
    # indentation in the message shown to the user.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess "
        "the quality format of the input fastq file. The user "
        "can specify the quality format of the input file using "
        "the --guess-format option. The script will use this format if "
        "sequences qualities are ambiguous.[default=%default].")

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script guesses the quality format of the input "
        "file and converts quality scores to the destination "
        "format unless --guess-format is specified [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    # per-read values are kept because the medians need the full
    # distribution
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += sum(1 for x in quals if x < min_quality)
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\t"
        "mean_quality\tmedian_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads,
                                          number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and applies the
    operation selected with --method; nothing is processed when no
    method is given. Output goes to ``options.stdout``; the paired
    variants of ``sample`` and ``sort`` also write the second read of
    each pair to the file named by ``options.output_filename_pattern``.

    Raises ValueError when a paired ``sample``/``sort`` is requested
    without an output filename pattern, or when ``apply`` is requested
    without a file of identifiers.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("apply",
                 "change-format",
                 "renumber-reads",
                 "sample",
                 "sort",
                 "trim3",
                 "trim5",
                 "unique",
                 "grep"),
        help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(method=None,
                        change_format=None,
                        guess_format=None,
                        sample_size=0.1,
                        nbases=0,
                        pair=None,
                        apply=None,
                        seed=None,
                        renumber_pattern="read_%010i",
                        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # compiled once; matching is anchored at the start of the
        # sequence (re.match semantics)
        pattern = re.compile(options.grep_pattern)
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if pattern.match(record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern,
                                        "w")
            try:
                # one random draw decides the fate of both mates so the
                # two output files stay in step
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                outfile2.close()
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        # ``options.apply`` has no command line option of its own, so
        # fall back to --map-tsv-file, the option documented for this
        # method.
        filename = options.apply or options.map_tsv_file
        if not filename:
            raise ValueError(
                "please specify a file with read identifiers "
                "(--map-tsv-file)")
        ids = set(IOTools.readList(IOTools.openFile(filename)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the part of the identifier before the first
            # space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1
        # Need to change this to incorporate both pairs

    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")

            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                # the last two characters of the identifier are dropped
                # and written back as "/1" and "/2" below
                entries1[record1.identifier[:-2]] = (
                    record1.seq, record1.quals)
                entries2[record2.identifier[:-2]] = (
                    record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern,
                                        "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1), \
                    "paired files do not contain the same reads " \
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.method == "renumber-reads":
        for id_count, record in enumerate(Fastq.iterate(options.stdin), 1):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes one tab-separated row per fastq record read from
    ``options.stdin``: the identifier, the number of quality values
    below ``options.min_quality``, the number of ``N``/``.`` characters
    in the sequence and the fields of ``Stats.Summary`` over the
    record's phred scores.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    # Help texts are built from adjacent literals: a backslash
    # continuation inside a string literal would embed the source
    # indentation in the message shown to the user.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the "
        "quality format of the input fastq file. The user can specify "
        "the quality format of the input file using the --guess-format "
        "option. The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will guess the quality format of the input "
        "file and convert quality scores to the destination format "
        "unless --guess-format is specified [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = sum(1 for x in quals if x < min_quality)
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write("%s\t%i\t%i\t%s\n" % (
            record.identifier,
            nfailed,
            nns,
            str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def process_cgat(options):
    """Apply the fastq operation selected by ``options.method``.

    Records are read from ``options.stdin`` and results are written to
    ``options.stdout``. For paired operations the second input file is
    ``options.pair`` and the second output file is
    ``options.output_filename_pattern``.

    Supported methods: ``change-format``, ``grep``, ``reverse-complement``,
    ``sample``, ``apply``, ``trim3``, ``trim5``, ``unique``, ``sort`` and
    ``renumber-reads``. Any other method value is ignored and an empty
    counter is returned.

    Arguments
    ---------
    options : object
        Parsed command line options. Only the attributes required by the
        selected method are read.

    Returns
    -------
    counter : E.Counter
        Counter with ``input`` (records read) and ``output`` (records
        written) tallies. The single-file ``sort`` is delegated to a shell
        pipeline and is therefore not counted.

    Raises
    ------
    ValueError
        If a paired operation is requested without
        ``options.output_filename_pattern``.
    AssertionError
        If input is not read from stdin, or if the paired files given to
        ``sort`` do not contain the same reads.
    """

    c = E.Counter()

    # all methods below consume options.stdin
    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # compile once instead of looking the pattern up for every record;
        # match() anchors at the start of the sequence.
        matches = re.compile(options.grep_pattern).match
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if matches(record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # NOTE(review): qualities are reversed here, so presumably
            # Genomics.complement returns the reverse complement - confirm.
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            # context managers make sure both files of the second pair
            # are flushed and closed, even if iteration fails.
            with IOTools.open_file(options.output_filename_pattern,
                                   "w") as outfile2, \
                    IOTools.open_file(options.pair) as infile2:
                for record1, record2 in zip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(infile2)):
                    c.input += 1
                    # one random draw per pair keeps mates together
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        with IOTools.open_file(options.apply) as id_file:
            ids = set(IOTools.read_list(id_file))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the part of the identifier before the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # only the first record of each identifier is kept
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file. The pipeline works
            # on the stdin/stdout of the process itself, so records are
            # not counted here.
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            status = os.system(statement)
            if status != 0:
                E.warn("sort pipeline exited with status %i" % status)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            with IOTools.open_file(options.pair) as infile2:
                for record1, record2 in zip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(infile2)):
                    c.input += 1
                    # drop the two-character mate suffix (e.g. "/1", "/2")
                    entries1[
                        record1.identifier[:-2]] = (record1.seq, record1.quals)
                    entries2[
                        record2.identifier[:-2]] = (record2.seq, record2.quals)

            # Checked before the second output file is opened so that a
            # failure does not leave an empty file behind.
            assert set(entries1).issubset(entries2),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            outfile1 = options.stdout
            with IOTools.open_file(options.output_filename_pattern,
                                   "w") as outfile2:
                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
                    c.output += 1

    elif options.method == "renumber-reads":
        # identifiers are replaced by a running number starting at 1
        for id_count, record in enumerate(Fastq.iterate(options.stdin), 1):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))
            c.output += 1

    return c