def main(argv=None):
    """Interleave two paired fastq files as fasta records on stdout.

    Parses command line options in sys.argv, unless *argv* is given.
    The two input files may be supplied through ``-a``/``-b`` or as
    exactly two positional arguments.

    Raises
    ------
    ValueError
        If the two fastq file names were not supplied.
    PairedReadError
        If one file yields a record while the other is exhausted.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    # two bare positional arguments override -a/-b
    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    if not (args.fastq1 and args.fastq2):
        raise ValueError(
            "please supply two fastq files, either via "
            "--first-fastq-file/--second-fastq-file or as two "
            "positional arguments")

    E.info("iterating over fastq files")
    f1_count = 0
    with iotools.open_file(args.fastq1) as fastq1, \
            iotools.open_file(args.fastq2) as fastq2:
        # zip_longest pads the shorter iterator, so a missing mate
        # shows up as a falsy entry instead of being silently dropped
        for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                                  Fastq.iterate(fastq2)):
            if not (f1 and f2):
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")

            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
def build(self, infiles, outfiles, output_prefix):
    """Return the shell command that merges a read pair with pandaseq.

    Parameters
    ----------
    infiles : sequence
        Exactly two input fastq files (forward, reverse).
    outfiles : sequence
        Only the first element is used; merged reads are gzipped into
        it and unpaired reads into ``<outfile>.unpaired.gz``.
    output_prefix : str
        Prefix for the pandaseq log files and for the ``.dir``
        directory whose contents are gzipped afterwards.

    Returns
    -------
    str
        The command line. It uses ``>(...)`` and ``>&``, which are
        bash constructs.
    """
    infile1, infile2 = infiles

    params = {
        "infile1": infile1,
        "infile2": infile2,
        "outfile": outfiles[0],
        "outdir": output_prefix + ".dir",
        "output_prefix": output_prefix,
        "processing_options": self.processing_options,
        "threads": self.threads,
    }

    # The stderr/stdout redirection has to be part of the pandaseq
    # command itself; separated by ';' it would only create an empty
    # log file and capture nothing.
    cmd = (
        "pandaseq -f %(infile1)s -r %(infile2)s "
        "%(processing_options)s "
        "-T %(threads)i "
        "-U >(gzip > %(outfile)s.unpaired.gz) "
        "-w >(gzip > %(outfile)s) "
        "-F "
        "-G %(output_prefix)s-pandaseq.log.bgz "
        ">& %(output_prefix)s-pandaseq.log; "
        "gzip %(outdir)s/*; "
    ) % params

    return cmd
def build(self, infiles, outfiles, output_prefix):
    """Return the sickle command line for one (se) or two (pe) files.

    The ``--qual-type`` value is looked up from the offset reported by
    ``Fastq.getOffset("sanger", raises=False)``. stderr is appended to
    ``<output_prefix>.log``.
    """
    n_inputs = len(infiles)
    assert n_inputs == len(outfiles)
    assert n_inputs in (1, 2)

    prefix = self.prefix
    offset = Fastq.getOffset("sanger", raises=False)
    tool_options = self.processing_options

    # translate the numeric quality offset into sickle's vocabulary
    quality_names = {33: 'sanger', 64: 'illumina', 59: 'solexa'}
    quality = quality_names[offset]

    if n_inputs == 1:
        cmd = (
            "sickle se -g %s --qual-type %s "
            "--output-file %s --fastq-file %s "
            "2>>%s.log ;"
        ) % (tool_options, quality, outfiles[0], infiles[0], output_prefix)
    elif n_inputs == 2:
        first_in, second_in = infiles
        first_out, second_out = outfiles
        cmd = (
            "sickle pe -g -s %s --qual-type %s "
            "-f %s -r %s -o %s -p %s "
            "2>>%s.log ;"
        ) % (tool_options, quality, first_in, second_in,
             first_out, second_out, output_prefix)

    return cmd
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match the
    primer sequence, then reconcile the surviving reads with read 2.

    The primer is ``PARAMS["grep_primer_b"]`` when the input file name
    contains ``_b.`` (not at position 0), otherwise
    ``PARAMS["grep_primer_a"]``; only the first
    ``PARAMS["grep_filter_length"]`` bases are compared.

    infile is expected to end in ``.fastq.1.gz``; outfiles must hold
    exactly two names.
    '''
    # function-scope import: the input and the intermediate file are
    # gzip-compressed and must not be handled with plain open()
    import gzip

    # presumably picked up by P.run() from this frame - TODO confirm
    to_cluster = True

    primer = "b" if infile.find("_b.") > 0 else "a"
    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    # NOTE(review): the mate file name drops the directory of infile,
    # so it is resolved relative to the working directory.
    infile2 = track + ".fastq.2.gz"
    # NOTE(review): the reconcile statement writes to a pattern under
    # filtered/ and never references these names - confirm intent.
    outfile1, outfile2 = outfiles
    # the name must stay `tempfile`: the statement below interpolates it
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match
    with gzip.open(infile, "rt") as fastq_in, \
            gzip.open(tempfile, "wt") as fastq_out:
        for read in fq.iterate(fastq_in):
            if read.seq[:grep_filter_length] == primer_subseq:
                # NOTE(review): other code using the Fastq module reads
                # `.identifier`/`.quals`; verify `.id`/`.qual` exist on
                # the records yielded by fq.iterate.
                fastq_out.write(
                    "@%s\n%s\n+\n%s\n" % (read.id, read.seq, read.qual))

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-filename-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''Write relative abundances of the simulated reads per taxonomic level.

    infiles[0] is a fastq file whose identifiers carry a gi as the
    second ``|``-separated field; infiles[1] is a tab-separated table
    with a header line and the columns gi, strain, species, genus,
    family, order, class, phylum.

    The output has the columns level, taxa, relab, where relab is the
    fraction of all reads assigned to that taxon at that level.
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]

    # map gi -> (species, genus, family, order, class, phylum)
    gi2taxa = collections.defaultdict(list)
    with open(infiles[1]) as taxa_file:
        taxa_file.readline()  # skip header
        for line in taxa_file:
            # strip only the line terminator so that a final line
            # without a newline does not lose its last character
            data = line.rstrip("\n").split("\t")
            (gi, strain, species, genus,
             family, order, _class, phylum) = data[:8]
            gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    # Count reads per gi in a single pass; the per-level totals are
    # derived from these counts instead of re-reading the fastq file
    # once for every level.
    total = 0
    reads_per_gi = collections.defaultdict(int)
    with iotools.openFile(infiles[0]) as fastq_file:
        for fastq in Fastq.iterate(fastq_file):
            total += 1
            reads_per_gi[fastq.identifier.split("|")[1]] += 1

    with open(outfile, "w") as outf:
        outf.write("level\ttaxa\trelab\n")
        for i, level in enumerate(levels):
            result = collections.defaultdict(int)
            for gi, count in reads_per_gi.items():
                result[gi2taxa[gi][i]] += count
            for taxon, value in result.items():
                outf.write("%s\t%s\t%s\n" %
                           (level, taxon, float(value) / total))
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''Estimate the expected coverage over each genome in the sample.

    infiles[0] is a fastq file whose identifiers carry a gi as the
    second ``|``-separated field; infiles[1] is a tab-separated table
    with a header line, an identifier whose second ``_``-separated
    field is the gi, and the genome size in the second column.

    For every gi seen in the reads the output holds the read count
    (doubled when the fastq file name ends in ``.fastq.1.gz``) divided
    by the genome size, capped at 1. gis without a genome size are
    skipped with a warning.

    NOTE(review): the theoretical coverage is usually
    read length * number of reads / genome size, but the read length
    is not part of this computation - confirm this is intended.
    '''
    # paired-end data: each record stands for two reads
    multiply = infiles[0].endswith(".fastq.1.gz")

    # get genome sizes into memory
    genome_sizes = {}
    with open(infiles[1]) as genomes:
        genomes.readline()  # skip header
        for line in genomes:
            # strip only the line terminator so that a final line
            # without a newline keeps the last digit of its size
            data = line.rstrip("\n").split("\t")
            gi = data[0].split("_")[1]
            genome_sizes[gi] = data[1]

    # count reads per gi
    reads_per_gi = collections.defaultdict(int)
    E.info("iterating over fastq file")
    with iotools.openFile(infiles[0]) as fastq_file:
        for fastq in Fastq.iterate(fastq_file):
            gi = fastq.identifier.split("|")[1]
            reads_per_gi[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    with open(outfile, "w") as outf:
        outf.write("gi\texpected_coverage\n")
        for gi, count in reads_per_gi.items():
            if multiply:
                count = count * 2
            if gi not in genome_sizes:
                E.warn("could not find gi no. %s in dictionary" % gi)
                continue
            proportion_coverage = float(count) / float(genome_sizes[gi])
            if proportion_coverage > 1:
                proportion_coverage = 1
            outf.write("%s\t%f\n" % (gi, proportion_coverage))
def main(argv=None):
    """Split a single-end fastq file into kept and dropped reads.

    Parses command line options in sys.argv, unless *argv* is given.
    Reads whose identifier (up to the first whitespace) is listed in
    the ``--to-drop-single`` file are written to ``--fastq-drop1``;
    all other reads are written to ``--fastq-out1``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser, argv=argv)

    # identifiers of the reads to remove, one per line
    with IOTools.open_file(options.to_remove_singletons) as inf:
        reads_to_remove = {line.strip() for line in inf}

    record_format = "@%s\n%s\n+\n%s\n"
    reads = 0
    dropped_reads = 0
    with IOTools.open_file(options.fq_out1, 'w') as fastq_out, \
            IOTools.open_file(options.fq_dropped1, 'w') as fastq_host, \
            IOTools.open_file(options.fastq1) as fastq_in:
        for read in Fastq.iterate(fastq_in):
            reads += 1
            record = record_format % (read.identifier, read.seq, read.quals)
            # only the part of the identifier before the first
            # whitespace is compared against the list
            if read.identifier.split()[0] in reads_to_remove:
                fastq_host.write(record)
                dropped_reads += 1
            else:
                fastq_out.write(record)

    if reads:
        percent_dropped = dropped_reads / float(reads) * 100
    else:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)'
           % (dropped_reads, reads, percent_dropped))
def build(self, infiles, outfiles, output_prefix):
    """Return the shell command running fastx_trimmer on each input.

    Every input file is decompressed with zcat, piped through
    ``fastx_trimmer`` with the instance's processing options and
    gzipped into the matching output file; stderr of fastx_trimmer is
    appended to ``<output_prefix>.log``.

    Parameters
    ----------
    infiles, outfiles : sequence
        One or two file names each, of equal length.
    output_prefix : str
        Prefix of the log file.

    Returns
    -------
    str
        The per-file pipelines separated by `` ; `` and terminated by
        a single `` ;``.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options

    cmds = [
        "zcat %s | fastx_trimmer -Q%s %s 2>> %s.log | gzip > %s" % (
            infile, offset, processing_options, output_prefix, outfile)
        for infile, outfile in zip(infiles, outfiles)
    ]

    # Terminate once at the very end: terminating every pipeline and
    # then joining with ' ; ' produces '; ;', an empty command that
    # bash rejects as a syntax error.
    return " ; ".join(cmds) + " ;"
def build(self, infiles, outfiles, output_prefix):
    """Return the shell command that merges a read pair with flash.

    flash writes into ``<output_prefix>.dir`` using the basename of
    *output_prefix* as output name; everything in that directory is
    gzipped and the ``extendedFrags`` file is moved to ``outfiles[0]``.
    Output of flash is redirected to ``<output_prefix>-flash.log``.
    """
    prefix = self.prefix
    phred_offset = Fastq.getOffset("sanger", raises=False)
    work_dir = os.path.join(output_prefix + ".dir")
    name = os.path.basename(output_prefix)
    tool_options = self.processing_options

    forward, reverse = infiles
    merged = outfiles[0]

    run_flash = "flash %s %s -p %s %s -o %s -d %s >& %s-flash.log; " % (
        forward, reverse, phred_offset, tool_options,
        name, work_dir, output_prefix)
    compress = "gzip %s/*; " % work_dir
    collect = "mv %s/%s.extendedFrags.fastq.gz %s; " % (
        work_dir, name, merged)

    return run_flash + compress + collect
def build(self, infiles, outfiles, output_prefix):
    """Return the shell command running trim_galore on one or two files.

    trim_galore writes into the directory of the (first) output file;
    its result files are then moved onto the requested output names.
    stderr of trim_galore is appended to ``<output_prefix>.log``.

    Parameters
    ----------
    infiles, outfiles : sequence
        One (single-end) or two (paired-end) file names each.
    output_prefix : str
        Prefix of the log file.

    Returns
    -------
    str
        The command line.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options

    if len(infiles) == 1:
        infile = infiles[0]
        outfile = outfiles[0]
        # an empty dirname would leave --output_dir without a value
        outdir = os.path.dirname(outfile) or "."
        # trim_galore names its results after the basename of the
        # input, so the directory of the input must not leak into the
        # path below the output directory
        trim_out = "%s/%s_trimmed.fq.gz" % (
            outdir, os.path.basename(infile).replace(".fastq.gz", ""))
        cmd = (
            "trim_galore %s --phred%s --output_dir %s %s 2>>%s.log; "
            "mv %s %s; "
        ) % (processing_options, offset, outdir, infile, output_prefix,
             trim_out, outfile)
    elif len(infiles) == 2:
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        outdir = os.path.dirname(outfile1) or "."
        # NOTE(review): unlike the single-end case the input suffix is
        # kept here; that matches names such as x.fastq.1.gz, which
        # trim_galore presumably does not strip - TODO confirm.
        val1 = "%s/%s_val_1.fq.gz" % (outdir, os.path.basename(infile1))
        val2 = "%s/%s_val_2.fq.gz" % (outdir, os.path.basename(infile2))
        cmd = (
            "trim_galore %s --paired --phred%s --output_dir %s "
            "%s %s 2>>%s.log; "
            "mv %s %s; "
            "mv %s %s; "
        ) % (processing_options, offset, outdir,
             infile1, infile2, output_prefix,
             val1, outfile1,
             val2, outfile2)

    return cmd
def build(self, infiles, outfiles, output_prefix):
    """Return the trimmomatic command line for one (SE) or two (PE) files.

    In paired-end mode the unpaired reads go to
    ``<output_prefix>.1.unpaired`` and ``<output_prefix>.2.unpaired``,
    which are gzipped afterwards. stderr is appended to
    ``<output_prefix>.log``.
    """
    n_inputs = len(infiles)
    assert n_inputs == len(outfiles)
    assert n_inputs in (1, 2)

    phred_offset = Fastq.getOffset("sanger", raises=False)
    n_threads = self.threads
    tool_options = self.processing_options

    if n_inputs == 1:
        cmd = "trimmomatic SE -threads %i -phred%s %s %s %s 2>> %s.log ;" % (
            n_threads, phred_offset, infiles[0], outfiles[0],
            tool_options, output_prefix)
    elif n_inputs == 2:
        forward_in, reverse_in = infiles
        forward_out, reverse_out = outfiles
        cmd = (
            "trimmomatic PE -threads %i -phred%s "
            "%s %s "
            "%s %s.1.unpaired "
            "%s %s.2.unpaired "
            "%s 2>> %s.log; "
            "gzip %s.*.unpaired; "
        ) % (n_threads, phred_offset,
             forward_in, reverse_in,
             forward_out, output_prefix,
             reverse_out, output_prefix,
             tool_options, output_prefix,
             output_prefix)

    return cmd
def main(argv=None):
    """Write a per-read quality table for a fastq file read from stdin.

    Command line options are taken from sys.argv unless *argv* is
    given. For each record one row is written to stdout: identifier,
    number of bases below the minimum quality, number of ``N``/``.``
    bases and the ``Stats.Summary`` of the phred scores.
    """
    argv = argv or sys.argv

    quality_formats = ('sanger', 'solexa', 'phred64', 'illumina-1.8',
                       'integer')

    # command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=quality_formats,
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=quality_formats,
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # common options (-h/--help, ...) are added by E.start
    (options, args) = E.start(parser, argv=argv)

    counter = E.Counter()

    # convert on the fly only when a target format was requested
    if options.target_format:
        records = Fastq.iterate_convert(options.stdin,
                                        format=options.target_format,
                                        guess=options.guess_format)
    else:
        records = Fastq.iterate_guess(options.stdin,
                                      guess=options.guess_format)

    summary_columns = "\t".join(Stats.Summary().getHeaders())
    options.stdout.write("read\tnfailed\tnN\t%s\n" % summary_columns)

    threshold = options.min_quality

    for record in records:
        counter.input += 1
        scores = record.toPhred()
        n_failed = sum(1 for score in scores if score < threshold)
        n_unknown = record.seq.count("N") + record.seq.count(".")
        row = (record.identifier, n_failed, n_unknown,
               str(Stats.Summary(scores)))
        options.stdout.write("%s\t%i\t%i\t%s\n" % row)
        counter.output += 1

    # footer and benchmark information
    E.info("%s" % str(counter))
    E.stop()
def main(argv=None):
    """Write summary statistics for a fastq file read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.
    A header and a single row are written to stdout: number of reads,
    number of bases, mean and median read length, mean and median of
    the per-read mean quality, and the number of bases whose phred
    score is below the minimum quality (10 by default).

    With empty input the four mean/median columns are reported as
    ``nan``.
    """
    if not argv:
        argv = sys.argv

    quality_formats = ('sanger', 'solexa', 'phred64', 'illumina-1.8',
                       'integer')

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # Help texts are built with implicit string concatenation; a
    # backslash continuation inside the literal would embed the
    # indentation of the following lines in the message.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=quality_formats,
        help="The default behaviour of the script is to guess "
        "the quality format of the input fastq file. The user "
        "can specify the quality format of the input file using "
        "the --guess-format option. The script will use this format if "
        "sequences qualities are ambiguous.[default=%default].")

    parser.add_option(
        "-f", "--target-format", dest="change_format",
        type="choice", choices=quality_formats,
        help="The script guesses the quality format of the input "
        "file and converts quality scores to the destination "
        "format unless --guess-format is specified [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += sum(1 for x in quals if x < min_quality)
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    if number_of_reads:
        mean_length = round(np.mean(read_lengths), 2)
        median_length = round(np.median(read_lengths), 2)
        mean_quality = round(np.mean(read_qualities), 2)
        median_quality = round(np.median(read_qualities), 2)
    else:
        # numpy would also yield nan for empty input, but only after
        # emitting runtime warnings
        mean_length = median_length = float("nan")
        mean_quality = median_quality = float("nan")

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\t"
        "mean_quality\tmedian_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads, number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.stop()
def process_cgat(options):
    """Apply ``options.method`` to the fastq records read from stdin.

    Records are read from ``options.stdin`` and written to
    ``options.stdout``. For paired operation (``options.pair`` set,
    methods ``sample`` and ``sort``) the second file is read from
    ``options.pair`` and its output written to
    ``options.output_filename_pattern``.

    Returns
    -------
    E.Counter
        ``input``/``output`` record counts. The unpaired ``sort``
        delegates to a shell pipeline and records no counts.

    Raises
    ------
    ValueError
        If a paired method is requested without
        ``options.output_filename_pattern``.
    """
    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # compile once instead of looking the pattern up per record
        pattern = re.compile(options.grep_pattern)
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if pattern.match(record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            # both handles are closed on exit so that buffered or
            # compressed output of the second pair is complete
            with iotools.open_file(options.output_filename_pattern,
                                   "w") as outfile2, \
                    iotools.open_file(options.pair) as pair_in:
                for record1, record2 in zip(Fastq.iterate(options.stdin),
                                            Fastq.iterate(pair_in)):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(iotools.read_list(iotools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the identifier up to the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file. The pipeline
            # runs on the stdin/stdout of this process; '\t' and '\n'
            # reach tr as literal tab and newline characters.
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")

            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            with iotools.open_file(options.pair) as pair_in:
                for record1, record2 in zip(Fastq.iterate(options.stdin),
                                            Fastq.iterate(pair_in)):
                    # the last two characters of the identifier (the
                    # /1 or /2 re-added below) are dropped from the key
                    entries1[
                        record1.identifier[:-2]] = (record1.seq,
                                                    record1.quals)
                    entries2[
                        record2.identifier[:-2]] = (record2.seq,
                                                    record2.quals)

            # validate before the second output file is created
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            outfile1 = options.stdout
            with iotools.open_file(options.output_filename_pattern,
                                   "w") as outfile2:
                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
            c.output += 1

    return c
def peek(sra, outdir=None):
    """Extract the first 1000 spots of an sra archive and inspect them.

    Parameters
    ----------
    sra : str
        The archive handed to ``fastq-dump``.
    outdir : path
        Perform the extraction in outdir. If outdir is None, the
        extraction takes place in a temporary directory, which is
        deleted afterwards (the returned file names then no longer
        exist).

    Returns
    -------
    files : list
        Sorted full names of the fastq files produced by the
        extraction. If three files are produced, only those ending in
        ``_1.fastq.gz`` and ``_3.fastq.gz`` are returned.
    format : string
        Result of ``Fastq.guessFormat`` on the first file.
    datatype : string
        Result of ``Fastq.guessDataType`` on the first file.

    Raises
    ------
    ValueError
        If the extraction produced no fastq file.
    """
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If file contains paired end data:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    *special case: unpaired reads in a paired end
        #                   --> prefix.fastq.gz
        #    *special case: if paired reads are stored in a single
        #       read, fastq-dump will split. There might be a joining
        #       sequence. The output would thus be:
        #       prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
        #      You want files 1 and 3.
        E.run("fastq-dump --split-files --gzip -X 1000 "
              "--outdir %s %s" % (workdir, sra))

        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if not f:
            raise ValueError(
                "fastq-dump produced no fastq files for %s" % sra)

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith(
                "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
        elif len(f) == 3:
            # keep the two real reads; sorted so that read 1 is first
            f = sorted(glob.glob(os.path.join(workdir, "*_[13].fastq.gz")))

        # check format of fastqs in .sra
        with iotools.open_file(f[0], "r") as inf:
            fastq_format = Fastq.guessFormat(inf, raises=False)
        with iotools.open_file(f[0], "r") as inf:
            fastq_datatype = Fastq.guessDataType(inf, raises=True)
    finally:
        # remove the temporary directory even if the extraction or
        # the format detection failed
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format, fastq_datatype
def main(argv=None):
    """Join the reads of two fastq files pairwise.

    Parses command line options in sys.argv, unless *argv* is given.
    Exactly two fastq files must be given as positional arguments.
    For every pair the second read is reverse-complemented, the offset
    (diagonal) supported by the most shared 2-mers is determined and
    the first read is extended by the tail of the second read. Merged
    records are written to stdout; pairs without any shared 2-mer are
    skipped and counted.

    Raises
    ------
    ValueError
        If the number of positional arguments is not two.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        tuple_size = 2
        with iotools.open_file(fn1) as inf1, iotools.open_file(fn2) as inf2:
            for left, right in zip(Fastq.iterate(inf1), Fastq.iterate(inf2)):
                c.input += 1

                # build dictionary of tuples; the +1 includes the
                # tuple ending at the last base
                s1, q1 = left.seq, left.quals
                d = collections.defaultdict(list)
                for x in range(len(s1) - tuple_size + 1):
                    d[s1[x:x + tuple_size]].append(x)

                s2, q2 = right.seq, right.quals
                s2 = Genomics.reverse_complement(s2)
                q2 = q2[::-1]

                # compute list of offsets/diagonals
                offsets = collections.defaultdict(int)
                for x in range(len(s2) - tuple_size + 1):
                    kmer = s2[x:x + tuple_size]
                    # .get avoids growing d with tuples absent from s1
                    for y in d.get(kmer, ()):
                        offsets[x - y] += 1

                if not offsets:
                    # nothing in common: no diagonal to merge on
                    E.warn('%s: no shared tuples, pair skipped' %
                           left.identifier)
                    c.skipped += 1
                    continue

                # find maximum diagonal; ties resolve to the largest
                # offset, as when sorting (count, offset) tuples
                max_count, max_offset = max(
                    (count, offset) for offset, count in offsets.items())

                E.debug('%s: maximum offset at %i' % (left.identifier,
                                                      max_offset))

                # simple merge sequence
                take = len(s2) - max_offset
                merged_seq = s1 + s2[take:]

                # simple merge quality scores
                merged_quals = q1 + q2[take:]

                new_entry = copy.copy(left)
                new_entry.seq = merged_seq
                new_entry.quals = merged_quals
                # write() needs text, not the record object
                outfile.write("%s\n" % new_entry)
                c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
def main(argv=None):
    """Split paired and singleton fastq files into kept and dropped reads.

    Parses command line options in sys.argv, unless *argv* is given.

    A read pair (``--fastq1``/``--fastq2``) is dropped if the
    identifier of either mate (up to the first whitespace) is listed
    in the ``--to-drop-paired`` file; a singleton (``--fastq3``) is
    dropped if its identifier is listed in the ``--to-drop-single``
    file. Dropped reads go to the ``--fastq-drop*`` files, all others
    to the ``--fastq-out*`` files.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--fastq2", dest="fastq2")
    parser.add_option("--fastq3", dest="fastq3")
    parser.add_option("--to-drop-paired", dest='to_remove_paired')
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-out2", dest="fq_out2")
    parser.add_option("--fastq-out3", dest="fq_out3")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")
    parser.add_option("--fastq-drop2", dest="fq_dropped2")
    parser.add_option("--fastq-drop3", dest="fq_dropped3")

    (options, args) = E.start(parser, argv=argv)

    # Fetch the reads to remove
    with IOTools.open_file(options.to_remove_paired) as inf:
        pairs_to_remove = {line.strip() for line in inf}
    with IOTools.open_file(options.to_remove_singletons) as inf:
        singles_to_remove = {line.strip() for line in inf}

    record_format = "@%s\n%s\n+\n%s\n"

    # open the outfiles
    fastq1_out = IOTools.open_file(options.fq_out1, 'w')
    fastq2_out = IOTools.open_file(options.fq_out2, 'w')
    fastq3_out = IOTools.open_file(options.fq_out3, 'w')
    fastq1_host = IOTools.open_file(options.fq_dropped1, 'w')
    fastq2_host = IOTools.open_file(options.fq_dropped2, 'w')
    fastq3_host = IOTools.open_file(options.fq_dropped3, 'w')
    outputs = (fastq1_out, fastq2_out, fastq3_out,
               fastq1_host, fastq2_host, fastq3_host)

    dropped_pairs = 0
    pairs = 0
    singletons = 0
    dropped_singletons = 0

    try:
        # Drop the paired reads
        with IOTools.open_file(options.fastq1) as inf1, \
                IOTools.open_file(options.fastq2) as inf2:
            for read1, read2 in zip(Fastq.iterate(inf1),
                                    Fastq.iterate(inf2)):
                pairs += 1

                # bmtagger truncates fastq headers at space and won't
                # accept non-identical headers therefore, if one read
                # matches, both are chucked.
                r1_id = read1.identifier.split()[0]
                r2_id = read2.identifier.split()[0]

                record1 = record_format % (
                    read1.identifier, read1.seq, read1.quals)
                record2 = record_format % (
                    read2.identifier, read2.seq, read2.quals)

                if r1_id in pairs_to_remove or r2_id in pairs_to_remove:
                    # Both are host
                    fastq1_host.write(record1)
                    fastq2_host.write(record2)
                    dropped_pairs += 1
                else:
                    # Neither are host
                    fastq1_out.write(record1)
                    fastq2_out.write(record2)

        # Drop singletons
        with IOTools.open_file(options.fastq3) as inf3:
            for read in Fastq.iterate(inf3):
                singletons += 1
                record = record_format % (
                    read.identifier, read.seq, read.quals)
                if read.identifier.split()[0] in singles_to_remove:
                    fastq3_host.write(record)
                    dropped_singletons += 1
                else:
                    fastq3_out.write(record)
    finally:
        # close the outputs even if reading one of the inputs fails
        for handle in outputs:
            handle.close()

    if pairs:
        percent_pairs = dropped_pairs / float(pairs) * 100
    else:
        percent_pairs = 0.0
    if singletons:
        percent_singletons = dropped_singletons / float(singletons) * 100
    else:
        percent_singletons = 0.0

    E.info('Dropped %i of %i read pairs (%f percent)'
           % (dropped_pairs, pairs, percent_pairs))
    E.info('Dropped %i of %i singletons (%f percent)'
           % (dropped_singletons, singletons, percent_singletons))