def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument( "-a", "--first-fastq-file", dest="fastq1", type=str, help="supply read1 fastq file") parser.add_argument( "-b", "--second-fastq-file", dest="fastq2", type=str, help="supply read2 fastq file") # add common options (-h/--help, ...) and parse command line (args, unknown) = E.start(parser, argv=argv, unknowns=True) if unknown and len(unknown) == 2: args.fastq1, args.fastq2 = unknown fastq1 = iotools.open_file(args.fastq1) fastq2 = iotools.open_file(args.fastq2) E.info("iterating over fastq files") f1_count = 0 for f1, f2 in zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)): if not (f1 and f2) or (not f2 and f1): try: raise PairedReadError( "unpaired reads detected. Are files sorted? are " "files of equal length?") except PairedReadError as e: raise PairedReadError(e).with_traceback(sys.exc_info()[2]) else: assert f1.identifier.endswith("/1") and \ f2.identifier.endswith("/2"), \ "Reads in file 1 must end with /1 and reads in file 2 with /2" args.stdout.write( ">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq)) f1_count += 1 E.info("output: %i pairs" % f1_count) # write footer and output benchmark information. E.stop()
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match
    the primer sequence (14bp)'''
    to_cluster = True

    primer = "a"
    if infile.find("_b.") > 0:
        primer = "b"

    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]

    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    infile2 = track + ".fastq.2.gz"
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter read1 by primer match; input and output are gzipped
    # (assumes `import gzip` at module level)
    fastq_in = gzip.open(infile, "rt")
    fastq_out = gzip.open(tempfile, "wt")
    for read in fq.iterate(fastq_in):
        if read.seq[:grep_filter_length] == primer_subseq:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.id, read.seq, read.qual))
    fastq_in.close()
    fastq_out.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py
                   --method=reconcile
                   %(tempfile)s %(infile2)s
                   --output-filename-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
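# A note on the statement above: P.run() interpolates local variables into
# the statement via %-formatting, so the doubled "%%i" survives interpolation
# as a literal "%i" for the downstream fastqs2fastq.py script. A standalone
# sketch of the same mechanism (variable values are illustrative only):

track = "sample1"
statement = ("--output-filename-pattern="
             "filtered/%(track)s.reconciled.fastq.%%i.gz")
print(statement % locals())
# -> --output-filename-pattern=filtered/sample1.reconciled.fastq.%i.gz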
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species-level relative abundances for the simulated data.
    This involves creating maps between different identifiers from
    the NCBI taxonomy, so that the results are comparable to the
    species-level analysis from metaphlan.
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]

    # build a map from gi number to taxonomic assignments
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[:8]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
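# The gi number is extracted from an NCBI-style identifier by splitting on
# "|". A small illustration (the header layout is an assumption about how
# the simulated reads are named):

identifier = "gi|556503834|ref|NC_000913.3|"
gi = identifier.split("|")[1]
assert gi == "556503834"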
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical coverage we
    would get over genomes in the sample i.e. at 1X coverage
    '''
    # if paired-end, the read count per genome will have to be
    # multiplied by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # count the number of reads per genome
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.items():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
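# Worked example of the theoretical-coverage formula quoted above,
# coverage = (L * N) / G. Note that the loop in the function increments by
# one per read (i.e. N alone), so the value written out is a reads-per-base
# ratio rather than full (L * N) / G coverage. Numbers below are
# illustrative only:

read_length = 100        # L (bp)
n_reads = 50000          # N (doubled for paired-end data)
genome_size = 5000000    # G (bp)
coverage = float(read_length * n_reads) / genome_size
print(coverage)  # -> 1.0, i.e. 1X coverage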
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser)

    reads_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    reads_to_remove = set([x.strip() for x in reads_to_remove])

    fastq_out = IOTools.open_file(options.fq_out1, 'w')
    fastq_host = IOTools.open_file(options.fq_dropped1, 'w')

    reads = 0
    dropped_reads = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq1)):
        reads += 1
        if read.identifier.split()[0] in reads_to_remove:
            fastq_host.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))
            dropped_reads += 1
        else:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))
    fastq_out.close()
    fastq_host.close()

    try:
        percent_dropped = dropped_reads / float(reads) * 100
    except ZeroDivisionError:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)' %
           (dropped_reads, reads, percent_dropped))
def process_cgat(options):

    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)
        random.seed(options.seed)
        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")
            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")
            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(iotools.read_list(iotools.open_file(options.apply)))
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # this is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)
            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1), \
                "paired files do not contain the same reads - " \
                "need to reconcile files"
            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0],
                                entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0],
                                entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))

    return c
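# A hedged usage sketch for process_cgat(): the options object normally
# comes from E.start(), but any namespace exposing the same attributes
# works, assuming Fastq.iterate accepts any file-like object. Attribute
# names mirror those used in the function; the fastq record is made up.

import io
from types import SimpleNamespace

options = SimpleNamespace(
    input_fastq_file="-",
    method="renumber-reads",
    renumber_pattern="read_%010i",
    stdin=io.StringIO("@read1\nACGT\n+\nIIII\n"),
    stdout=io.StringIO())
counter = process_cgat(options)
print(options.stdout.getvalue())
# -> @read_0000000001
#    ACGT
#    +
#    IIII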
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", choices=('join', ), help="method to apply [default=%default].") parser.set_defaults(method="join", ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if len(args) != 2: raise ValueError( "please supply at least two fastq files on the commandline") fn1, fn2 = args c = E.Counter() outfile = options.stdout if options.method == "join": # merge based on diagonals in dotplot iter1 = Fastq.iterate(iotools.open_file(fn1)) iter2 = Fastq.iterate(iotools.open_file(fn2)) tuple_size = 2 for left, right in zip(iter1, iter2): c.input += 1 # build dictionary of tuples s1, q1 = left.seq, left.quals d = collections.defaultdict(list) for x in range(len(s1) - tuple_size): d[s1[x:x + tuple_size]].append(x) s2, q2 = right.seq, right.quals s2 = Genomics.reverse_complement(s2) q2 = q2[::-1] # compute list of offsets/diagonals offsets = collections.defaultdict(int) for x in range(len(s2) - tuple_size): c = s2[x:x + tuple_size] for y in d[c]: offsets[x - y] += 1 # find maximum diagonal sorted = sorted([(y, x) for x, y in list(offsets.items())]) max_count, max_offset = sorted[-1] E.debug('%s: maximum offset at %i' % (left.identifier, max_offset)) # simple merge sequence take = len(s2) - max_offset merged_seq = s1 + s2[take:] # simple merge quality scores merged_quals = q1 + q2[take:] new_entry = copy.copy(left) new_entry.seq = merged_seq new_entry.quals = merged_quals outfile.write(new_entry) c.output += 1 # write footer and output benchmark information. E.info("%s" % str(c)) E.stop()
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--fastq2", dest="fastq2")
    parser.add_option("--fastq3", dest="fastq3")
    parser.add_option("--to-drop-paired", dest='to_remove_paired')
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-out2", dest="fq_out2")
    parser.add_option("--fastq-out3", dest="fq_out3")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")
    parser.add_option("--fastq-drop2", dest="fq_dropped2")
    parser.add_option("--fastq-drop3", dest="fq_dropped3")

    (options, args) = E.start(parser)

    # fetch the reads to remove
    pairs_to_remove = IOTools.open_file(options.to_remove_paired).readlines()
    pairs_to_remove = set([x.strip() for x in pairs_to_remove])

    singles_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    singles_to_remove = set([x.strip() for x in singles_to_remove])

    # open the outfiles
    fastq1_out = IOTools.open_file(options.fq_out1, 'w')
    fastq2_out = IOTools.open_file(options.fq_out2, 'w')
    fastq3_out = IOTools.open_file(options.fq_out3, 'w')
    fastq1_host = IOTools.open_file(options.fq_dropped1, 'w')
    fastq2_host = IOTools.open_file(options.fq_dropped2, 'w')
    fastq3_host = IOTools.open_file(options.fq_dropped3, 'w')

    dropped_pairs = 0
    pairs = 0

    # drop the paired reads
    for read1, read2 in zip(Fastq.iterate(IOTools.open_file(options.fastq1)),
                            Fastq.iterate(IOTools.open_file(options.fastq2))):
        pairs += 1
        # bmtagger truncates fastq headers at the first space and won't
        # accept non-identical headers; therefore, if one read matches,
        # both are dropped.
        r1_id = read1.identifier.split()[0]
        r2_id = read2.identifier.split()[0]
        if r1_id in pairs_to_remove or r2_id in pairs_to_remove:
            # both are host
            fastq1_host.write("@%s\n%s\n+\n%s\n" %
                              (read1.identifier, read1.seq, read1.quals))
            fastq2_host.write("@%s\n%s\n+\n%s\n" %
                              (read2.identifier, read2.seq, read2.quals))
            dropped_pairs += 1
        else:
            # neither is host
            fastq1_out.write("@%s\n%s\n+\n%s\n" %
                             (read1.identifier, read1.seq, read1.quals))
            fastq2_out.write("@%s\n%s\n+\n%s\n" %
                             (read2.identifier, read2.seq, read2.quals))

    # drop singletons
    singletons = 0
    dropped_singletons = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq3)):
        singletons += 1
        if read.identifier.split()[0] in singles_to_remove:
            fastq3_host.write("@%s\n%s\n+\n%s\n" %
                              (read.identifier, read.seq, read.quals))
            dropped_singletons += 1
        else:
            fastq3_out.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))

    fastq1_out.close()
    fastq2_out.close()
    fastq3_out.close()
    fastq1_host.close()
    fastq2_host.close()
    fastq3_host.close()

    try:
        percent_pairs = dropped_pairs / float(pairs) * 100
    except ZeroDivisionError:
        percent_pairs = 0.0
    try:
        percent_singletons = dropped_singletons / float(singletons) * 100
    except ZeroDivisionError:
        percent_singletons = 0.0

    E.info('Dropped %i of %i read pairs (%f percent)' %
           (dropped_pairs, pairs, percent_pairs))
    E.info('Dropped %i of %i singletons (%f percent)' %
           (dropped_singletons, singletons, percent_singletons))
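# Both read-dropping scripts above match reads on the identifier truncated
# at the first whitespace, mirroring bmtagger's behaviour of cutting fastq
# headers at the space. A small illustration (identifiers are made up):

to_remove = {"HWI-D001:8:1101:1224:2104"}
identifier = "HWI-D001:8:1101:1224:2104 1:N:0:ACAGTG"
print(identifier.split()[0] in to_remove)  # -> True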