def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Iterates over two fastq files in lockstep and writes every read pair
    to stdout as two consecutive fasta records.

    Raises PairedReadError if one file runs out of reads before the other.
    """

    if not argv:
        argv = sys.argv

    # zip_longest is called izip_longest on python 2; support both.
    try:
        from itertools import zip_longest
    except ImportError:
        from itertools import izip_longest as zip_longest

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file")
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        # zip_longest pads the shorter iterator with None, so a missing
        # mate means the two files are out of step.
        if not (f1 and f2):
            raise PairedReadError(
                "unpaired reads detected. Are files sorted? are files of equal length?")

        assert f1.identifier.endswith("/1") and \
            f2.identifier.endswith("/2"), \
            "Reads in file 1 must end with /1 and reads in file 2 with /2"
        options.stdout.write(
            ">%s\n%s\n>%s\n%s\n" %
            (f1.identifier, f1.seq, f2.identifier, f2.seq))
        f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from stdin and writes the sequences to a
    ``csfasta`` file and the quality strings to a ``qual`` file. Both
    file names are built from ``--pattern``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--pattern", dest="pattern", type="string",
        help="output filename pattern with a placeholder for the "
        "file type [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    try:
        outfile_qual = IOTools.openFile(options.pattern % "qual", "w")
        try:
            if options.change_format:
                records = Fastq.iterate_convert(
                    options.stdin,
                    format=options.change_format,
                    guess=options.guess_format)
            else:
                records = Fastq.iterate(options.stdin)

            for record in records:
                c.input += 1
                outfile_seq.write(
                    ">%s\n%s\n" % (record.identifier, record.seq))
                outfile_qual.write(
                    ">%s\n%s\n" % (record.identifier, record.quals))
                c.output += 1
        finally:
            # close even if a record fails to parse so buffered
            # (possibly compressed) output is flushed.
            outfile_qual.close()
    finally:
        outfile_seq.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The two fastq files are given either through ``-a``/``-b`` or as two
    positional arguments. Every read pair is written to stdout as two
    consecutive fasta records.

    Raises ValueError if fewer than two input files are supplied and
    PairedReadError if one file runs out of reads before the other.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-fastq-file", dest="fastq1", type="string",
        help="supply read1 fastq file")
    parser.add_option(
        "-b", "--second-fastq-file", dest="fastq2", type="string",
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # two positional arguments override -a/-b
    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    if not options.fastq1 or not options.fastq2:
        raise ValueError("please supply two fastq files")

    fastq1 = IOTools.open_file(options.fastq1)
    fastq2 = IOTools.open_file(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    try:
        for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                                  Fastq.iterate(fastq2)):
            # zip_longest pads the exhausted iterator with None
            if not (f1 and f2):
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")

            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
    finally:
        fastq1.close()
        fastq2.close()

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Input files come from ``-a``/``-b`` or from two positional
    arguments. Each read pair is emitted on stdout as two fasta records.

    Raises ValueError when an input file is missing from the command
    line, PairedReadError when the files contain different numbers of
    reads.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-fastq-file", dest="fastq1", type="string",
        help="supply read1 fastq file")
    parser.add_option(
        "-b", "--second-fastq-file", dest="fastq2", type="string",
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # positional arguments take precedence over the options
    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    if not options.fastq1 or not options.fastq2:
        raise ValueError("please supply two fastq files")

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    try:
        pairs = zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2))
        for f1, f2 in pairs:
            # a None on either side means one file ended early
            if not (f1 and f2):
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")

            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
    finally:
        fastq1.close()
        fastq2.close()

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.Stop()
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get relative abundances at several taxonomic levels for the
    simulated data.

    ``infiles[0]`` is a fastq file whose read identifiers carry the gi
    number as the second ``|``-separated field. ``infiles[1]`` is a
    tab-separated table with a header line and the columns gi, strain,
    species, genus, family, order, class, phylum.

    Writes a table with the columns level, taxa and relab to *outfile*,
    where relab is the fraction of all reads assigned to that taxon.

    Raises KeyError if a read refers to a gi that is not in the table.
    '''

    levels = ["species", "genus", "family", "order", "class", "phylum"]

    # map gi -> (species, genus, family, order, class, phylum)
    gi2taxa = {}
    with open(infiles[1]) as taxa_file:
        taxa_file.readline()  # skip header
        for line in taxa_file:
            if not line.strip():
                continue
            data = line.rstrip("\n").split("\t")
            # column 1 (strain) is deliberately not used
            gi2taxa[data[0]] = tuple(data[i] for i in range(2, 8))

    # a single pass over the reads fills the counts for every level
    total = 0
    counts = [collections.defaultdict(int) for _ in levels]
    fastq_file = IOTools.openFile(infiles[0])
    try:
        for fastq in Fastq.iterate(fastq_file):
            total += 1
            gi = fastq.identifier.split("|")[1]
            if gi not in gi2taxa:
                raise KeyError("gi %s not found in %s" % (gi, infiles[1]))
            for level_counts, name in zip(counts, gi2taxa[gi]):
                level_counts[name] += 1
    finally:
        fastq_file.close()

    with open(outfile, "w") as outf:
        outf.write("level\ttaxa\trelab\n")
        for level, level_counts in zip(levels, counts):
            for name, value in level_counts.items():
                outf.write("%s\t%s\t%s\n" %
                           (level, name, float(value) / total))
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs flash on a pair of files.

    *infiles* must hold exactly two file names. Only the first entry of
    *outfiles* is used: the ``extendedFrags`` file produced by flash is
    gzipped and moved there.
    """
    prefix = self.prefix  # read as in the original; not used below

    fields = {
        "offset": Fastq.getOffset("sanger", raises=False),
        "outdir": output_prefix + ".dir",
        "track": os.path.basename(output_prefix),
        "processing_options": self.processing_options,
        "output_prefix": output_prefix,
    }
    fields["infile1"], fields["infile2"] = infiles
    fields["outfile"] = outfiles[0]

    template = (
        "flash %(infile1)s %(infile2)s"
        " -p %(offset)s"
        " %(processing_options)s"
        " -o %(track)s"
        " -d %(outdir)s"
        " >& %(output_prefix)s-flash.log;"
        " checkpoint;"
        " gzip %(outdir)s/*;"
        " checkpoint;"
        " mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s; "
    )
    return template % fields
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs pandaseq on a pair of files.

    *infiles* must hold exactly two file names. The merged reads are
    gzipped into ``outfiles[0]`` and the unpaired reads into
    ``outfiles[0] + ".unpaired.gz"``.
    """
    prefix = self.prefix  # read as in the original; not used below

    fields = {
        "offset": Fastq.getOffset("sanger", raises=False),
        "outdir": output_prefix + ".dir",
        "track": os.path.basename(output_prefix),
        "processing_options": self.processing_options,
        "threads": self.threads,
        "output_prefix": output_prefix,
    }
    fields["infile1"], fields["infile2"] = infiles
    fields["outfile"] = outfiles[0]

    # NOTE(review): the ";" in front of ">&" ends the pandaseq command,
    # so the redirect applies to an empty command - confirm intended.
    # NOTE(review): nothing in this statement creates %(outdir)s before
    # "gzip %(outdir)s/*" runs - verify against the caller.
    template = (
        "pandaseq -f %(infile1)s -r %(infile2)s"
        " %(processing_options)s"
        " -T %(threads)i"
        " -U >(gzip > %(outfile)s.unpaired.gz)"
        " -w >(gzip > %(outfile)s)"
        " -F"
        " -G %(output_prefix)s-pandaseq.log.bgz;"
        " >& %(output_prefix)s-pandaseq.log;"
        " checkpoint;"
        " gzip %(outdir)s/*;"
        " checkpoint; "
    )
    return template % fields
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match the
    primer sequence.

    The primer is ``PARAMS["grep_primer_b"]`` if "_b." occurs in
    *infile*, otherwise ``PARAMS["grep_primer_a"]``. Only the first
    ``PARAMS["grep_filter_length"]`` bases are compared. Matching reads
    are written to ``filtered/<track>.filtered.fastq.1.gz`` and then
    reconciled with the second read file through ``fastqs2fastq.py``.
    '''
    import gzip

    # local names below (to_cluster, statement, tempfile, infile2, track)
    # are kept because P.run() is called without arguments and is
    # presumably reading them from this frame - TODO confirm.
    to_cluster = True

    if infile.find("_b.") > 0:
        primer_seq = PARAMS["grep_primer_b"]
    else:
        primer_seq = PARAMS["grep_primer_a"]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    # NOTE(review): the directory of infile is dropped here - confirm
    # that the second read file lives in the working directory.
    infile2 = track + ".fastq.2.gz"
    # NOTE(review): outfiles are unpacked but the statement below writes
    # to filtered/<track>.reconciled.fastq.%i.gz - verify they agree.
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match; both files are gzip-compressed, so they
    # must go through gzip rather than plain open()
    fastq_in = gzip.open(infile, "rt")
    try:
        fastq_out = gzip.open(tempfile, "wt")
        try:
            for read in fq.iterate(fastq_in):
                if read.seq[:grep_filter_length] == primer_subseq:
                    # NOTE(review): other fastq code in this project
                    # uses record.identifier / record.quals - confirm
                    # that fq records expose .id and .qual.
                    fastq_out.write("@" + read.id + "\n")
                    fastq_out.write(read.seq + "\n")
                    fastq_out.write("+\n")
                    fastq_out.write(read.qual + "\n")
        finally:
            fastq_out.close()
    finally:
        fastq_in.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-filename-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs sickle.

    One input file gives a ``sickle se`` call, two give ``sickle pe``.
    *infiles* and *outfiles* must have the same length.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    prefix = self.prefix  # read as in the original; not used below
    offset = Fastq.getOffset("sanger", raises=False)
    options = self.processing_options

    # translate the quality offset into sickle's --qual-type names
    quality = {33: 'sanger', 64: 'illumina', 59: 'solexa'}[offset]

    if len(infiles) == 2:
        first_in, second_in = infiles
        first_out, second_out = outfiles
        cmd = (
            "sickle pe -g -s %s"
            " --qual-type %s"
            " -f %s -r %s"
            " -o %s -p %s"
            " 2>>%s.log ;"
        ) % (options, quality, first_in, second_in,
             first_out, second_out, output_prefix)
    elif len(infiles) == 1:
        cmd = (
            "sickle se -g %s"
            " --qual-type %s"
            " --output-file %s"
            " --fastq-file %s"
            " 2>>%s.log ;"
        ) % (options, quality, outfiles[0], infiles[0], output_prefix)

    return cmd
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match the
    primer sequence.

    Primer "b" (``PARAMS["grep_primer_b"]``) is used if "_b." occurs in
    *infile*, primer "a" otherwise. The comparison covers the first
    ``PARAMS["grep_filter_length"]`` bases. Reads that pass are written
    to ``filtered/<track>.filtered.fastq.1.gz`` and reconciled with the
    second read file by ``fastqs2fastq.py``.
    '''
    import gzip

    # P.run() takes no arguments; it is presumably picking up
    # to_cluster, statement and the interpolated names from this frame,
    # so these local names must not be renamed - TODO confirm.
    to_cluster = True

    primer_key = "grep_primer_b" if infile.find("_b.") > 0 \
        else "grep_primer_a"
    primer_seq = PARAMS[primer_key]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    # NOTE(review): infile2 has no directory component - confirm the
    # mate file is expected in the working directory.
    infile2 = track + ".fastq.2.gz"
    # NOTE(review): outfiles are not referenced by the statement below.
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match. Input and output are gzip files and file
    # objects have no writeln(), so use gzip handles and write().
    fastq_in = gzip.open(infile, "rt")
    try:
        fastq_out = gzip.open(tempfile, "wt")
        try:
            for read in fq.iterate(fastq_in):
                if read.seq[:grep_filter_length] != primer_subseq:
                    continue
                # NOTE(review): confirm fq records expose .id / .qual
                # rather than .identifier / .quals.
                fastq_out.write(
                    "@%s\n%s\n+\n%s\n" % (read.id, read.seq, read.qual))
        finally:
            fastq_out.close()
    finally:
        fastq_in.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters:

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards (also when the extraction fails).

    Returns a tuple of the sorted list of file names and the quality
    format guessed from the first of them.
    """

    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If file contains paired end data:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    *special case: unpaired reads in a paired end
        #                   --> prefix.fastq.gz
        #    *special case: if paired reads are stored in a single read,
        #                   fastq-dump will split. There might be a
        #                   joining sequence. The output would thus be:
        #                   prefix_1.fastq.gz, prefix_2.fastq.gz,
        #                   prefix_3.fastq.gz
        #                   You want files 1 and 3.
        E.run("""fastq-dump --split-files --gzip -X 1000 --outdir %(workdir)s %(sra)s""" % locals())
        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith(
                "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
        elif len(f) == 3:
            # NOTE(review): the original took the same glob whether or
            # not the third file ends in _3.fastq.gz. If the three files
            # are prefix, prefix_1 and prefix_2, this keeps only
            # prefix_1 - confirm that is intended.
            f = sorted(glob.glob(os.path.join(workdir, "*_[13].fastq.gz")))
        # a single file (prefix.fastq.gz) needs no further selection

        # check format of fastqs in .sra
        handle = IOTools.openFile(f[0], "r")
        try:
            fastq_format = Fastq.guessFormat(handle, raises=False)
        finally:
            handle.close()
    finally:
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format
def replace(fastqfile, baseToReplace):
    '''replaces the base at the specified position with N

    *fastqfile* is a gzip-compressed fastq file and *baseToReplace* the
    0-based position (anything accepted by ``int``) that is set to "N"
    in every read. The result is written next to the input as
    ``replaced_<basename of fastqfile>``.

    Raises IndexError if a read is shorter than the requested position.
    '''
    import os

    position = int(baseToReplace)

    # prefix the file name, not the whole path, so that inputs in
    # another directory work
    dirname, basename = os.path.split(fastqfile)
    outname = os.path.join(dirname, "replaced_" + basename)

    # use gzip as default to open the fastq file
    fastq = gzip.open(fastqfile)
    try:
        outf = gzip.open(outname, "w")
        try:
            for record in Fastq.iterate(fastq):
                bases = list(record.seq)
                bases[position] = "N"
                record.seq = "".join(bases)
                outf.write("@" + record.identifier + "\n" +
                           record.seq + "\n" +
                           "+" + record.identifier + "\n" +
                           record.quals + "\n")
        finally:
            # closing writes the gzip trailer; without it the output
            # may be truncated
            outf.close()
    finally:
        fastq.close()
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    estimate, per genome, the proportion expected to be covered by the
    reads in a fastq file.

    ``infiles[0]`` is the fastq file; the gi number is the second
    ``|``-separated field of each read identifier. ``infiles[1]`` is a
    tab-separated table with a header line, an identifier whose second
    ``_``-separated field is the gi number, and the genome size.

    For each gi seen in the reads the value written is
    (number of reads) / (genome size), capped at 1. The read count is
    doubled if the fastq file name ends in ".fastq.1.gz" (paired end).
    NOTE(review): read length is not part of the calculation although
    coverage is usually defined as reads * read length / genome size -
    confirm this is intended.
    '''

    # if paired end then will have to multiply by two
    multiply = infiles[0].endswith(".fastq.1.gz")

    # get genome sizes into memory
    genome_sizes = {}
    with open(infiles[1]) as genomes:
        genomes.readline()  # skip header
        for line in genomes:
            if not line.strip():
                continue
            data = line.rstrip("\n").split("\t")
            gi = data[0].split("_")[1]
            genome_sizes[gi] = data[1]

    # count the reads per gi
    read_counts = collections.defaultdict(int)
    E.info("iterating over fastq file")
    fastq_file = IOTools.openFile(infiles[0])
    try:
        for fastq in Fastq.iterate(fastq_file):
            gi = fastq.identifier.split("|")[1]
            read_counts[gi] += 1
    finally:
        fastq_file.close()
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    with open(outfile, "w") as outf:
        outf.write("gi\texpected_coverage\n")
        for gi, nreads in read_counts.items():
            if multiply:
                nreads = nreads * 2
            if gi not in genome_sizes:
                E.warn("could not find gi no. %s in dictionary" % gi)
                continue
            proportion_coverage = float(nreads) / float(genome_sizes[gi])
            if proportion_coverage > 1:
                proportion_coverage = 1
            outf.write("%s\t%f\n" % (gi, proportion_coverage))
def build(self, infile, outfile, processer_list):
    '''build the statement that runs the processing steps.

    The quality format is guessed from ``infile[0]``. The statement is
    made of the commands returned by :meth:`process` and
    :meth:`cleanup`, joined by ``checkpoint;``. Each of the three
    commands must end in ";".
    '''

    # close the handle once the format has been determined
    handle = IOTools.openFile(infile[0], "r")
    try:
        f_format = Fastq.guessFormat(handle, raises=False)
    finally:
        handle.close()

    cmd_process, cmd_post, processed_files = self.process(
        infile[0], processer_list, outfile, f_format, save=self.save)
    cmd_clean = self.cleanup(outfile)

    # the parts are joined with "checkpoint;", so each must be a
    # terminated shell command
    assert cmd_process.strip().endswith(";")
    assert cmd_post.strip().endswith(";")
    assert cmd_clean.strip().endswith(";")

    statement = " checkpoint; ".join((cmd_process, cmd_post, cmd_clean))
    return statement
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Two independent conversions are performed, each only if its input
    file is given:

    * ``--profilematrix``: every row of the whitespace-separated matrix
      is divided by its row total (first column is kept as the row
      name; rows with a total of 0 are dropped).
    * ``--fasta-file``: every fasta record is written as a fastq record
      with a constant phred quality of 30.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # optparse only accepts single-letter short options, so the
    # original "-pm" could never be registered
    parser.add_option("-p", "--profilematrix", dest="matrixfile",
                      type="string",
                      help="name of profile file you want to convert")

    # the fasta conversion below read options.fastafile, which had no
    # option defining it
    parser.add_option("--fasta-file", dest="fastafile", type="string",
                      help="name of fasta file to convert to fastq")

    parser.set_defaults(matrixfile=None, fastafile=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.matrixfile:
        matrix = IOTools.openFile(options.matrixfile)
        try:
            for line in matrix:
                fields = line.strip().split()
                values = [float(col) for col in fields[1:]]
                total = sum(values)
                if total == 0:
                    # also skips blank lines
                    continue
                row = [fields[0]] + [value / total for value in values]
                options.stdout.write("\t".join(map(str, row)) + "\n")
        finally:
            matrix.close()

    if options.fastafile:
        fasta = IOTools.openFile(options.fastafile)
        try:
            for fasta_read in FastaIterator.iterate(fasta):
                read_sequence = fasta_read.sequence
                read_name = fasta_read.title

                # placeholder qualities, overwritten by fromPhred below
                quals = '.' * len(read_sequence)

                new_fastq = Fastq.Record(identifier=read_name,
                                         seq=read_sequence,
                                         quals=quals)
                new_fastq.fromPhred([30] * len(read_sequence),
                                    format='illumina-1.8')

                options.stdout.write(str(new_fastq) + "\n")
        finally:
            fasta.close()

    # write footer and output benchmark information.
    E.Stop()
def preprocessIdba(infile, outfile):
    '''
    preprocess pooled reads for IDBA

    A file ending in ".fastq.gz" is converted to fasta directly. A file
    ending in ".1.gz" is treated as the first of a pair; the matching
    ".2.gz" file must exist and both are passed to ``fastqs2fasta.py``.
    Files with any other suffix are ignored.
    '''

    if infile.endswith(".fastq.gz"):
        # single file: convert in-process
        E.info("converting fastq file to fasta file")
        inf = IOTools.openFile(infile)
        try:
            with open(outfile, "w") as outf:
                for fastq in Fastq.iterate(inf):
                    outf.write(
                        ">%s\n%s\n" % (fastq.identifier, fastq.seq))
        finally:
            inf.close()

    # check for second read in the pair
    elif infile.endswith(".1.gz"):
        read2 = P.snip(infile, ".1.gz") + ".2.gz"
        assert os.path.exists(read2), "file does not exist %s" % read2

        # statement, infile, read2 and outfile keep their names because
        # P.run() is called without arguments and presumably reads them
        # from this frame - TODO confirm.
        statement = '''python %(scriptsdir)s/fastqs2fasta.py -a %(infile)s -b %(read2)s --log=%(infile)s.log > %(outfile)s'''
        P.run()
def peek(sra, outdir):
    ''' returns the full file names for all files which will be extracted

    The first 1000 spots of *sra* are dumped into *outdir* with
    fastq-dump. Returns a tuple of the sorted list of fastq files
    found there and the quality format guessed from the first one.
    '''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz,
    #                   prefix_3.fastq.gz
    #                   You want files 1 and 3.
    E.run("""fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
    elif len(f) == 3:
        # NOTE(review): the original used the same glob in both
        # branches of a test on ff[2]. With prefix, prefix_1 and
        # prefix_2 this returns prefix_1 only - confirm intended.
        f = sorted(glob.glob(os.path.join(outdir, "*_[13].fastq.gz")))
    # a single file (prefix.fastq.gz) is returned as is

    # check format of fastqs in .sra
    handle = IOTools.openFile(f[0], "r")
    try:
        fastq_format = Fastq.guessFormat(handle, raises=False)
    finally:
        handle.close()

    return f, fastq_format
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs fastx_trimmer.

    One ``zcat | fastx_trimmer | gzip`` pipeline is built per
    input/output pair; the pipelines are joined by ``checkpoint;``.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    prefix = self.prefix  # read as in the original; not used below
    offset = Fastq.getOffset("sanger", raises=False)
    options = self.processing_options

    template = (
        "zcat %(infile)s"
        " | fastx_trimmer -Q%(offset)s %(processing_options)s"
        " 2>> %(output_prefix)s.log"
        " | gzip > %(outfile)s ;"
    )

    pipelines = [
        template % {
            "infile": source,
            "outfile": target,
            "offset": offset,
            "processing_options": options,
            "output_prefix": output_prefix,
        }
        for source, target in zip(infiles, outfiles)
    ]
    return " checkpoint; ".join(pipelines)
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs trimmomatic.

    One input file gives a ``trimmomatic SE`` call. Two give
    ``trimmomatic PE``; the unpaired reads go to
    ``<output_prefix>.1.unpaired`` / ``.2.unpaired`` and are gzipped
    afterwards.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    fields = {
        "offset": Fastq.getOffset("sanger", raises=False),
        "threads": self.threads,
        "processing_options": self.processing_options,
        "output_prefix": output_prefix,
    }

    if len(infiles) == 2:
        fields["infile1"], fields["infile2"] = infiles
        fields["outfile1"], fields["outfile2"] = outfiles
        cmd = (
            "trimmomatic PE -threads %(threads)i -phred%(offset)s"
            " %(infile1)s %(infile2)s"
            " %(outfile1)s %(output_prefix)s.1.unpaired"
            " %(outfile2)s %(output_prefix)s.2.unpaired"
            " %(processing_options)s"
            " 2>> %(output_prefix)s.log;"
            " checkpoint;"
            " gzip %(output_prefix)s.*.unpaired; "
        ) % fields
    elif len(infiles) == 1:
        fields["infile"] = infiles[0]
        fields["outfile"] = outfiles[0]
        cmd = (
            "trimmomatic SE -threads %(threads)i -phred%(offset)s"
            " %(infile)s %(outfile)s"
            " %(processing_options)s"
            " 2>> %(output_prefix)s.log ;"
        ) % fields

    return cmd
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs trim_galore.

    trim_galore writes its results into ``--output_dir`` under a name
    derived from the *basename* of the input file; the statement moves
    them to the requested *outfiles*. The output directory is the
    directory of the (first) output file, or "." if it has none.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options

    def _stem(filename):
        # name trim_galore builds its output from: the basename with a
        # trailing .fastq.gz / .fq.gz removed, other names unchanged
        name = os.path.basename(filename)
        for suffix in (".fastq.gz", ".fq.gz"):
            if name.endswith(suffix):
                return name[:-len(suffix)]
        return name

    if len(infiles) == 1:
        infile = infiles[0]
        outfile = outfiles[0]
        # an empty directory would leave --output_dir without a value
        outdir = os.path.dirname(outfile) or "."
        trim_out = os.path.join(outdir, _stem(infile) + "_trimmed.fq.gz")
        cmd = '''trim_galore %(processing_options)s --phred%(offset)s --output_dir %(outdir)s %(infile)s 2>>%(output_prefix)s.log; mv %(trim_out)s %(outfile)s; ''' % locals()

    elif len(infiles) == 2:
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        outdir = os.path.dirname(outfile1) or "."
        trim_out1 = os.path.join(outdir, _stem(infile1) + "_val_1.fq.gz")
        trim_out2 = os.path.join(outdir, _stem(infile2) + "_val_2.fq.gz")
        cmd = '''trim_galore %(processing_options)s --paired --phred%(offset)s --output_dir %(outdir)s %(infile1)s %(infile2)s 2>>%(output_prefix)s.log; mv %(trim_out1)s %(outfile1)s; mv %(trim_out2)s %(outfile2)s; ''' % locals()

    return cmd
def build(self, infiles, outfiles, output_prefix):
    """Return the shell statement that runs trim_galore.

    The statement runs trim_galore with ``--output_dir`` set to the
    directory of the (first) output file ("." if it has none) and then
    moves the trimmed files to *outfiles*.

    NOTE(review): the original never defined ``outdir`` although the
    command template uses it, so the function could not succeed. The
    directory and the names of the trimmed files are derived here the
    same way as in the sibling trim_galore builder - confirm.
    """
    assert len(infiles) == len(outfiles)
    assert len(infiles) in (1, 2)

    offset = Fastq.getOffset("sanger", raises=False)
    processing_options = self.processing_options

    def _trimmed_name(filename, tag):
        # trim_galore names output after the input basename, with a
        # trailing .fastq.gz / .fq.gz replaced by the tag
        name = os.path.basename(filename)
        for suffix in (".fastq.gz", ".fq.gz"):
            if name.endswith(suffix):
                name = name[:-len(suffix)]
                break
        return name + tag

    if len(infiles) == 1:
        infile = infiles[0]
        outfile = outfiles[0]
        outdir = os.path.dirname(outfile) or "."
        trim_out = os.path.join(
            outdir, _trimmed_name(infile, "_trimmed.fq.gz"))
        cmd = '''trim_galore %(processing_options)s --phred%(offset)s --output_dir %(outdir)s %(infile)s 2>>%(output_prefix)s.log; mv %(trim_out)s %(outfile)s; ''' % locals()

    else:
        # the asserts above guarantee exactly two files here
        infile1, infile2 = infiles
        outfile1, outfile2 = outfiles
        outdir = os.path.dirname(outfile1) or "."
        trim_out1 = os.path.join(
            outdir, _trimmed_name(infile1, "_val_1.fq.gz"))
        trim_out2 = os.path.join(
            outdir, _trimmed_name(infile2, "_val_2.fq.gz"))
        cmd = '''trim_galore %(processing_options)s --paired --phred%(offset)s --output_dir %(outdir)s %(infile1)s %(infile2)s 2>>%(output_prefix)s.log; mv %(trim_out1)s %(outfile1)s; mv %(trim_out2)s %(outfile2)s; ''' % locals()

    return cmd
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from stdin, passes them through an
    ``Extractor`` built from the barcode pattern(s) and writes the
    result to stdout. If ``--read2-in`` is given, the reads of that
    file are processed together with stdin and written to
    ``--read2-out``.

    Raises ValueError for missing or inconsistent options.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-barcode", dest="split", action="store_true",
                      help="barcode is split across read pair")
    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--supress-stats", dest="stats", action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(split=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # check options
    if not options.pattern:
        raise ValueError("must specify a pattern using ``--bc-pattern``")

    if options.split:
        if not options.read2_in:
            raise ValueError("must specify a paired fastq ``--read2-in``")

        # both reads share the pattern unless a second one is given
        if not options.pattern2:
            options.pattern2 = options.pattern

    if options.read2_in:
        if not options.read2_out:
            raise ValueError("must specify an output for the paired end "
                             "``--read2-out``")

    # Initialise the processor
    processor = Extractor(options.pattern, options.pattern2, options.prime3)
    read1s = Fastq.iterate(options.stdin)

    if options.read2_in is None:
        for read in read1s:
            options.stdout.write(str(processor(read)) + "\n")
    else:
        read2_in = IOTools.openFile(options.read2_in)
        try:
            read2_out = IOTools.openFile(options.read2_out, "w")
            try:
                # NOTE(review): zip stops at the shorter input, so
                # surplus reads in either file are dropped silently.
                for read1, read2 in zip(read1s, Fastq.iterate(read2_in)):
                    new_1, new_2 = processor(read1, read2)
                    options.stdout.write(str(new_1) + "\n")
                    read2_out.write(str(new_2) + "\n")
            finally:
                # flush buffered output for the second read
                read2_out.close()
        finally:
            read2_in.close()

    # write footer and output benchmark information.
    if options.stats:
        options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
        for key in processor.bc_count:
            options.stdlog.write(
                "\t".join(key + (str(processor.bc_count[key]),)) + "\n")

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes one row per fastq record read from stdin: the identifier,
    the number of bases below the minimum quality, the number of
    "N"/"." bases and the columns of ``Stats.Summary`` computed on the
    phred scores.
    """

    if not argv:
        argv = sys.argv

    formats = ('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer')

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=formats,
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=formats,
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(target_format=None,
                        guess_format=None,
                        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if not options.target_format:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)
    else:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)

    header = "\t".join(Stats.Summary().getHeaders())
    options.stdout.write("read\tnfailed\tnN\t%s\n" % (header))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        # bases whose phred score is below the threshold
        nfailed = sum(1 for qual in quals if qual < min_quality)
        # both "N" and "." count as uncalled bases
        nns = record.seq.count("N") + record.seq.count(".")
        summary = str(Stats.Summary(quals))
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" % (record.identifier, nfailed, nns, summary))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from stdin and writes a single summary row:
    number of reads and bases, mean and median read length, mean and
    median of the per-read mean quality, and the number of bases whose
    phred score is below the minimum quality (10).
    """

    if not argv:
        argv = sys.argv

    formats = ('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer')

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # The help texts used to point at a non-existent --format option
    # and carried the indentation of backslash-continued lines.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=formats,
        help="The default behaviour of the script is to guess "
        "the quality format of the input fastq file. The user "
        "can specify the quality format of the input file using "
        "the --guess-format option. The script will use this format "
        "if sequence qualities are ambiguous [default=%default].")

    parser.add_option(
        "-f", "--target-format", dest="change_format", type="choice",
        choices=formats,
        help="The script guesses the quality format of the input "
        "file and converts quality scores to the destination "
        "format given with this option [default=%default].")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    # per-read values are kept because the medians need all of them
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += sum(1 for x in quals if x < min_quality)
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\tmean_quality\tmedian_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads, number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.Stop()
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters
    ----------

    outdir : path

        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards, including when the extraction fails.

    Returns
    -------
    files : list
        A sorted list of fastq formatted files that are contained in
        the archive.
    format : string
        The quality score format in the :term:`fastq` formatted files.

    """

    workdir = tempfile.mkdtemp() if outdir is None else outdir

    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If file contains paired end data:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    *special case: unpaired reads in a paired end
        #                   --> prefix.fastq.gz
        #    *special case: if paired reads are stored in a single read,
        #                   fastq-dump will split. There might be a
        #                   joining sequence. The output would thus be:
        #                   prefix_1.fastq.gz, prefix_2.fastq.gz,
        #                   prefix_3.fastq.gz
        #                   You want files 1 and 3.
        E.run("""fastq-dump --split-files --gzip -X 1000 --outdir %(workdir)s %(sra)s""" % locals())
        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith(
                "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
        elif len(f) == 3:
            # NOTE(review): both branches of the original test on ff[2]
            # ran this same glob. For prefix, prefix_1 and prefix_2 it
            # keeps prefix_1 only - confirm that is intended.
            f = sorted(glob.glob(os.path.join(workdir, "*_[13].fastq.gz")))
        # one file: sra file contains one read, prefix.fastq.gz

        # check format of fastqs in .sra
        handle = IOTools.openFile(f[0], "r")
        try:
            fastq_format = Fastq.guessFormat(handle, raises=False)
        finally:
            handle.close()
    finally:
        # never leave the temporary directory behind
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from stdin and writes the sequences to a
    ``csfasta`` file and the quality strings to a ``qual`` file. Both
    file names are built from ``--pattern-identifier``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--target-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="set quality scores to format "
        "[default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="pattern", type="string",
        help="output filename pattern with a placeholder for the "
        "file type [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.open_file(options.pattern % "csfasta", "w")
    try:
        outfile_qual = IOTools.open_file(options.pattern % "qual", "w")
        try:
            if options.change_format:
                records = Fastq.iterate_convert(
                    options.stdin,
                    format=options.change_format,
                    guess=options.guess_format)
            else:
                records = Fastq.iterate(options.stdin)

            for record in records:
                c.input += 1
                outfile_seq.write(
                    ">%s\n%s\n" % (record.identifier, record.seq))
                outfile_qual.write(
                    ">%s\n%s\n" % (record.identifier, record.quals))
                c.output += 1
        finally:
            # close on every path so buffered output is flushed
            outfile_qual.close()
    finally:
        outfile_seq.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes one row per fastq record read from stdin: the identifier,
    the number of bases with a phred score below the minimum quality
    (10), the number of "N"/"." bases and the ``Stats.Summary`` columns
    of the phred scores.
    """

    if not argv:
        argv = sys.argv

    formats = ('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer')

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    # The help texts used to refer to a --format option that this
    # script does not define and contained continuation whitespace.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=formats,
        help="The default behaviour of the script is to guess the "
        "quality format of the input fastq file. The user can specify "
        "the quality format of the input file using the --guess-format "
        "option. The script will use this format if the "
        "sequence qualities are ambiguous [default=%default].")

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=formats,
        help="The script will guess the quality format of the input "
        "file and convert quality scores to the destination format "
        "given with this option [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = sum(1 for x in quals if x < min_quality)
        # "N" and "." both mark uncalled bases
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write("%s\t%i\t%i\t%s\n" % (record.identifier,
                                                   nfailed,
                                                   nns,
                                                   str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def preprocess(self, infiles, outfile):
    '''build preprocessing statement

    Build a command line statement that extracts/converts
    various input formats to fastq formatted files.

    For ``.fastq.gz`` input whose guessed format does not include
    'sanger', quality scores are converted to sanger format through
    ``fastq2fastq.py`` (single-end input only if ``self.convert`` is set).

    Side effects: sets ``self.tmpdir_fastq`` and, for fasta and
    colourspace input, ``self.datatype``.

    returns the statement (a ``;``-joined shell command string) and the
    list of fastq files to map (one tuple/list of filenames per input).

    raises ValueError if a required companion file is missing and
    NotImplementedError for unrecognised file suffixes.
    '''

    assert len(infiles) > 0, "no input files for mapping"

    tmpdir_fastq = P.getTempDir()

    # create temporary directory again for nodes
    statement = ["mkdir -p %s" % tmpdir_fastq]
    fastqfiles = []

    # get track by extension of outfile
    track = os.path.splitext(os.path.basename(outfile))[0]

    if self.compress:
        compress_cmd = "| gzip"
        extension = ".gz"
    else:
        compress_cmd = ""
        extension = ""

    # NOTE: the command templates below are filled in with ``% locals()``,
    # so the local variable names used in them must not be renamed.
    for infile in infiles:

        if infile.endswith(".export.txt.gz"):
            # single end illumina export
            # ``%%%%s`` survives this ``% locals()`` pass as ``%%s``;
            # presumably a later interpolation reduces it to ``%s`` for awk.
            statement.append("""gunzip < %(infile)s | awk '$11 != "QC" || $10 ~ /(\\d+):(\\d+):(\\d+)/ \
            { if ($1 != "") { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);} \
            else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); } \
            printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}' \
            %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
            fastqfiles.append(
                ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))

        elif infile.endswith(".fa.gz"):
            statement.append(
                '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals())
            fastqfiles.append(("%s/%s.fa" % (tmpdir_fastq, track),))
            self.datatype = "fasta"

        elif infile.endswith(".sra"):
            # sneak preview to determine if paired end or single end
            outdir = P.getTempDir()
            # --split-files is present in fastq-dump 2.1.7
            P.execute(
                "fastq-dump --split-files --gzip -X 1000 "
                "--outdir %(outdir)s %(infile)s" % locals())
            # --split-files will create files called prefix_#.fastq.gz
            # where # is the read number.
            # The following cases are:
            # * file contains paired end data:
            #      output = prefix_1.fastq.gz, prefix_2.fastq.gz
            #    * special case: unpaired reads in a paired end run end up
            #      in prefix.fastq.gz
            #    * special case: if paired reads are stored in a single read,
            #      fastq-dump will split. There might be a joining sequence.
            #      The output would thus be:
            #      prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
            #      You want files 1 and 3.
            f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
            ff = [os.path.basename(x) for x in f]
            if len(f) == 1:
                # sra file contains one read: output = prefix.fastq.gz
                pass
            elif len(f) == 2:
                # sra file contains read pairs:
                # output = prefix_1.fastq.gz, prefix_2.fastq.gz
                assert ff[0].endswith(
                    "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
            elif len(f) == 3:
                # Both branches of the former test on ff[2] selected the
                # same "*_[13]" files, so they are merged here.
                # NOTE(review): for the "unpaired reads end up in
                # prefix.fastq.gz" case this picks only read 1; it looks
                # like "*_[12].fastq.gz" was intended - confirm.
                f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))
            E.info("sra file contains the following files: %s" % f)
            # only the basenames found in the preview are needed below
            shutil.rmtree(outdir)
            fastqfiles.append(
                ["%s/%s" % (tmpdir_fastq, os.path.basename(x))
                 for x in sorted(f)])
            statement.append(
                "fastq-dump --split-files --gzip "
                "--outdir %(tmpdir_fastq)s %(infile)s" % locals())

        elif infile.endswith(".fastq.gz"):
            format = Fastq.guessFormat(
                IOTools.openFile(infile, "r"), raises=False)
            if 'sanger' not in format and self.convert:
                statement.append("""gunzip < %(infile)s \
                | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log \
                %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))
            else:
                E.debug("%s: assuming quality score format %s" %
                        (infile, format))
                fastqfiles.append((infile, ))

        elif infile.endswith(".csfasta.gz"):
            # single end SOLiD data
            if self.preserve_colourspace:
                quality = P.snip(infile, ".csfasta.gz") + ".qual.gz"
                if not os.path.exists(quality):
                    raise ValueError("no quality file for %s" % infile)
                statement.append(
                    """gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals())
                statement.append(
                    """gunzip < %(quality)s > %(tmpdir_fastq)s/%(track)s.qual%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.csfasta%s" % (tmpdir_fastq, track, extension),
                     "%s/%s.qual%s" % (tmpdir_fastq, track, extension)))
                self.datatype = "solid"
            else:
                quality = P.snip(infile, ".csfasta.gz") + ".qual.gz"

                # fixed: the template ended in "%(extension)" without the
                # "s" conversion, which makes "% locals()" raise ValueError.
                statement.append("""solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s) \
                %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))

        elif infile.endswith(".csfasta.F3.gz"):
            # paired end SOLiD data
            if self.preserve_colourspace:
                bn = P.snip(infile, ".csfasta.F3.gz")
                # order is important - mirrors tophat reads followed by quals
                f = []
                for suffix in ("csfasta.F3", "csfasta.F5",
                               "qual.F3", "qual.F5"):
                    fn = "%(bn)s.%(suffix)s" % locals()
                    if not os.path.exists(fn + ".gz"):
                        raise ValueError(
                            "expected file %s.gz missing" % fn)
                    statement.append("""gunzip < %(fn)s.gz \
                    %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s""" % locals())
                    f.append(
                        "%(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s" % locals())
                fastqfiles.append(f)
                self.datatype = "solid"
            else:
                # NOTE(review): infile ends in ".csfasta.F3.gz" here, but the
                # suffix snipped is ".csfasta.gz" and only one read file is
                # converted - this branch looks copied from the single-end
                # case; confirm the intended paired-end handling.
                quality = P.snip(infile, ".csfasta.gz") + ".qual.gz"
                statement.append("""solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s) \
                %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))

        elif infile.endswith(".fastq.1.gz"):
            # paired-end fastq: the mate is expected in <prefix>.fastq.2.gz
            bn = P.snip(infile, ".fastq.1.gz")
            infile2 = "%s.fastq.2.gz" % bn
            if not os.path.exists(infile2):
                raise ValueError(
                    "can not find paired ended file '%s' for '%s'" %
                    (infile2, infile))

            format = Fastq.guessFormat(
                IOTools.openFile(infile), raises=False)
            if 'sanger' not in format:
                statement.append("""gunzip < %(infile)s \
                | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log \
                %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.1.fastq%(extension)s; \
                gunzip < %(infile2)s \
                | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log \
                %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.2.fastq%(extension)s """ % locals())
                fastqfiles.append(
                    ("%s/%s.1.fastq%s" % (tmpdir_fastq, track, extension),
                     "%s/%s.2.fastq%s" % (tmpdir_fastq, track, extension)))
            else:
                E.debug("%s: assuming quality score format %s" %
                        (infile, format))
                fastqfiles.append((infile, infile2, ))

        else:
            raise NotImplementedError("unknown file format %s" % infile)

    self.tmpdir_fastq = tmpdir_fastq

    assert len(fastqfiles) > 0, "no fastq files for mapping"
    return "; ".join(statement) + ";", fastqfiles
def processReads(infiles, outfile):
    '''process reads.

    *infiles* is a pair ``(infile, contaminant_file)``. Depending on the
    ``PARAMS`` switches the reads are either

    * combined with ``flash`` (``process_combine_reads``; paired data only),
    * sampled as pairs (``process_sample`` with paired data), or
    * piped through a chain of contaminant removal, artifact filtering,
      trimming, quality trimming, quality filtering and sampling.

    The first two modes return as soon as they are done. For paired data
    in the third mode both files are processed separately and then
    reconciled with ``fastqs2fastqs.py``.

    raises IOError if combining reads is requested for unpaired data.

    NOTE: every ``statement`` is executed by ``P.run()`` without arguments,
    so the placeholders in it (``%(infile)s``, ``%(track)s``, ...) are
    presumably resolved from this function's local variables and PARAMS.
    Local names (including the apparently unused ``to_cluster``) must
    therefore not be renamed or removed.
    '''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        # fixed garbled message ("cannot be can not be combined")
        E.warn("combining reads can not be combined with other "
               "processing for paired ended reads")
        if not infile2:
            raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = \
            PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(
            map(str, [read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn("if specifying --max-overlap read and fragment "
                   "length options will be ignored")
            max_overlap = "--max-overlap=%i" % \
                PARAMS["combine_reads_max_overlap"]
            fragment_options = ""

        elif len(fragment_options.strip().split(" ")) < 3:
            # max-overlap is known to be unset in this branch
            E.warn("have not specified --read-len, --frag-len, "
                   "--frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and \
                PARAMS["combine_reads_fragment_length"] and \
                PARAMS["combine_reads_fragment_length_stdev"]:
            # A nested max-overlap warning used to live here; it was
            # unreachable because the first branch handles that case.
            max_overlap = ""
            fragment_options = (
                "--read-len=%(read_len)i "
                "--fragment-len=%(frag_len)i "
                "--fragment-len-stddev=%(frag_stdev)i" % locals())
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % \
                PARAMS["combine_reads_min_overlap"]

        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]

        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % \
                PARAMS["combine_reads_phred_offset"]

        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % \
                PARAMS["combine_reads_max_mismatch_density"]

        statement = ('''flash '''
                     '''%(min_overlap)s '''
                     '''%(max_overlap)s '''
                     '''%(max_mismatch_density)s '''
                     '''%(phred_offset)s '''
                     '''%(fragment_options)s '''
                     '''--output-prefix=%(track)s '''
                     '''%(threads)s '''
                     '''--compress '''
                     '''%(infile)s %(infile2)s >> %(outfile)s.log ''')
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            # keep uncombined reads as well as the extended fragments
            infiles = " ".join(
                [track + x for x in [".notCombined_1.fastq.gz",
                                     ".notCombined_2.fastq.gz",
                                     ".extendedFrags.fastq.gz"]])
            statement = ('''zcat %(infiles)s | gzip > %(outfile)s; '''
                         '''rm -rf %(infiles)s''')
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''

        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn("sampling can not be combined with other processing "
               "for paired ended reads")
        statement = ('''zcat %(infile)s '''
                     '''| python %(scriptsdir)s/fastq2fastq.py '''
                     '''--sample=%(sample_proportion)f '''
                     '''--pair=%(infile2)s '''
                     '''--outfile-pair=%(outfile2)s '''
                     '''--log=%(outfile)s_sample.log '''
                     '''| gzip > %(outfile)s ''')
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    # ``s`` collects the stages of a shell pipeline, joined with " | " below
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [(''' cutadapt '''
              '''%(adaptors)s '''
              '''--overlap=%(contamination_min_overlap_length)i '''
              '''--format=fastq '''
              '''%(contamination_options)s '''
              '''<( zcat < %(infile)s ) '''
              '''2>> %(outfile)s_contaminants.log ''')]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append('fastx_artifacts_filter -Q %(offset)i -v '
                 '%(artifacts_options)s 2>> %(outfile)s_artifacts.log')
        do_sth = True

    if PARAMS["process_trim"]:
        s.append('fastx_trimmer -Q %(offset)i -v %(trim_options)s '
                 '2>> %(outfile)s_trim.log')
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append('fastq_quality_trimmer -Q %(offset)i -v '
                 '%(trim_quality_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    if PARAMS["process_filter"]:
        s.append('fastq_quality_filter -Q %(offset)i -v '
                 '%(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append('python %(scriptsdir)s/fastq2fastq.py '
                 '--sample=%(sample_proportion)f '
                 '--log=%(outfile)s_sample.log')
        # fixed: sampling is a processing step in its own right. Without
        # this flag a sampling-only run on single-end data was reported as
        # "nothing done" and produced no output.
        do_sth = True

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        # rebinding ``infile`` makes the same pipeline template read the mate
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = ("""python %(scriptsdir)s/fastqs2fastqs.py """
                     """--method=reconcile """
                     """--output-pattern=%(track)s.fastq.%%s.gz """
                     """%(tmpfile1)s %(tmpfile2)s """
                     """> %(outfile)s_reconcile.log""")
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and applies the method
    chosen with ``--method``, writing records to ``options.stdout``.
    For paired data (``--pair-fastq-file``) the methods ``sample`` and
    ``sort`` write the second mate to the file named by
    ``options.output_filename_pattern``.

    raises ValueError if a paired method is requested without an output
    filename for the second mate, or if method ``apply`` is used without
    a file of identifiers.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("apply",
                               "change-format",
                               "renumber-reads",
                               "sample",
                               "sort",
                               "trim3",
                               "trim5",
                               "unique",
                               "grep"),
                      help="method to apply [%default]")

    parser.add_option("--target-format", dest="target_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer',
                               'illumina-1.8'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option("--pair-fastq-file", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option("--num-bases", dest="nbases", type="int",
                      help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    # ``apply`` is kept for backwards compatibility; the identifier file is
    # actually supplied through --map-tsv-file (dest ``map_tsv_file``).
    parser.set_defaults(method=None,
                        change_format=None,
                        guess_format=None,
                        sample_size=0.1,
                        nbases=0,
                        pair=None,
                        apply=None,
                        map_tsv_file=None,
                        seed=None,
                        renumber_pattern="read_%010i",
                        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # re.match anchors the pattern at the start of the sequence
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            # close the second output explicitly so that buffered
            # (possibly compressed) data is flushed to disk
            try:
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                outfile2.close()
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        # fixed: the identifier file is given with --map-tsv-file; reading
        # ``options.apply`` alone always yielded None.
        map_filename = options.map_tsv_file or options.apply
        if not map_filename:
            raise ValueError("please specify a file with identifiers "
                             "for method apply (--map-tsv-file)")
        ids = set(IOTools.readList(IOTools.openFile(map_filename)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare on the identifier up to the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        # keep the first record seen for each identifier
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}
            # identifiers are keyed without their last two characters
            # (presumably the "/1" and "/2" mate suffixes)
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1),\
                    "paired files do not contain the same reads "\
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def setfastqAttr(self, infiles):
    '''store the quality offset for this object's fastq format.

    Looks up the offset for ``self.f_format`` with ``Fastq.getOffset``
    (called with ``raises=False``) and stores it in ``self.offset``.
    *infiles* is accepted but not used here.
    '''
    quality_offset = Fastq.getOffset(self.f_format, raises=False)
    self.offset = quality_offset
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and writes a two-line,
    tab-separated summary to ``options.stdout``: number of reads, number
    of bases, mean and median read length, mean and median of the
    per-read mean quality, and the number of bases with a quality below
    ``options.min_quality`` (column ``nfailed``).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    # The help texts used to refer to a non-existent ``--format`` option and
    # were built with backslash continuations inside the literal, which
    # embedded runs of indentation whitespace in the displayed help.
    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the quality "
        "format of the input file using the --guess-format option. The "
        "script will use this format if sequence qualities are ambiguous "
        "[default=%default].")

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script guesses the quality format of the input file and "
        "converts quality scores to the destination format. The format "
        "given by --guess-format is used if the guess is ambiguous "
        "[default=%default].")

    # min_quality has no command line option; it is only set here.
    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        # count without materialising an intermediate list
        bases_below_min += sum(1 for x in quals if x < min_quality)
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    # per-file statistics, rounded to two decimals for output
    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\tmean_quality\t"
        "median_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads, number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two fastq filenames as positional arguments. With
    method ``join`` each pair of records is merged: the second read is
    passed through ``Genomics.complement`` and its qualities reversed, the
    most frequent offset ("diagonal") between shared 2-mers of both reads
    is determined, and the tail of the second read is appended to the
    first. Merged records are written to ``options.stdout``.

    raises ValueError if the number of positional arguments is not two.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        # the check is for exactly two files, so say so
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples: k-mer -> start positions in s1
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            # +1 so that the last k-mer of the read is included
            for x in range(len(s1) - tuple_size + 1):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size + 1):
                # fixed: this k-mer used to be assigned to ``c`` and thereby
                # clobbered the counter
                kmer = s2[x:x + tuple_size]
                # .get avoids inserting empty entries for unseen k-mers
                for y in d.get(kmer, ()):
                    offsets[x - y] += 1

            if not offsets:
                # no shared k-mer: nothing to anchor a merge on. This used
                # to fail with an IndexError; skip the pair and count it.
                c.no_overlap += 1
                continue

            # find maximum diagonal (ties resolved by the larger offset).
            # fixed: the result used to be bound to the name ``sorted``,
            # which made the builtin unreachable (UnboundLocalError).
            max_count, max_offset = max(
                (count, offset) for offset, count in offsets.items())

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            # fixed: write the record's text, not the record object itself
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and applies the method
    chosen with ``--method``, writing records to ``options.stdout``.
    For paired data (``--pair-fastq-file``) the methods ``sample`` and
    ``sort`` write the second mate to the file named by
    ``options.output_filename_pattern``.

    raises ValueError if a paired method is requested without an output
    filename for the second mate, or if method ``apply`` is used without
    a file of identifiers.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "apply",
                          "change-format",
                          "renumber-reads",
                          "sample",
                          "sort",
                          "trim3",
                          "trim5",
                          "unique",
                          "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    # ``apply`` is kept for backwards compatibility; the identifier file is
    # actually supplied through --map-tsv-file (dest ``map_tsv_file``).
    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        map_tsv_file=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # re.match anchors the pattern at the start of the sequence
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            # fixed: the attribute is ``output_filename_pattern`` (as tested
            # just above), not ``outfile_filename_pattern``
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            # close the second output explicitly so that buffered
            # (possibly compressed) data is flushed to disk
            try:
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                outfile2.close()
        else:
            # fixed: this single-end loop lacked its ``else:`` and was not
            # tied to the unpaired case
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        # fixed: the identifier file is given with --map-tsv-file; reading
        # ``options.apply`` alone always yielded None.
        map_filename = options.map_tsv_file or options.apply
        if not map_filename:
            raise ValueError("please specify a file with identifiers "
                             "for method apply (--map-tsv-file)")
        ids = set(IOTools.readList(IOTools.openFile(map_filename)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare on the identifier up to the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        # keep the first record seen for each identifier
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}
            # identifiers are keyed without their last two characters
            # (presumably the "/1" and "/2" mate suffixes)
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1),\
                    "paired files do not contain the same reads "\
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Collects the query names of the mapped (or unmapped, see ``--reads``)
    alignments in the BAM file given by ``--bamfile`` and then filters the
    fastq records read from ``options.stdin``: records whose identifier is
    in the collected set are written to ``options.stdout``, or, with
    ``--invert``, records whose identifier is not in the set.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="input bamfile to filter reads from")
    parser.add_option("-r", "--reads", dest="reads", type="choice",
                      choices=("mapped", "unmapped"),
                      help="type of read to keep")
    parser.add_option("-s", "--scriptsdir", dest="scriptsdir", type="string",
                      help="CGAT scripts directory")
    # help text joined by implicit concatenation; the former backslash
    # continuation embedded indentation whitespace in the message
    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert selection - if for example unmapped reads "
                      "aren't output")

    # NOTE(review): the scriptsdir default is a site-specific path and the
    # option is not read anywhere in this function.
    parser.set_defaults(bamfile=None,
                        reads="mapped",
                        scriptsdir="/ifs/devel/nicki/cgat_git/cgat/scripts",
                        invert=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()
    c.input_alignments = 0
    c.input_reads = 0
    c.output_reads = 0

    # output text file for reads TO KEEP
    bam = pysam.Samfile(options.bamfile, "rb")
    temp = P.getTempFile(".")
    tempname = temp.name

    # the outer try/finally guarantees that the temporary file is removed
    # even if reading the BAM or fastq input fails
    try:
        E.info("iterating over bam file")
        try:
            for alignment in bam.fetch(until_eof=True):
                c.input_alignments += 1
                if options.reads == "unmapped":
                    if alignment.is_unmapped:
                        temp.write(alignment.qname + "\n")
                elif options.reads == "mapped":
                    if not alignment.is_unmapped:
                        temp.write(alignment.qname + "\n")
        finally:
            temp.close()
            bam.close()

        E.info("filtering fastq file")
        # filter fastq file
        id_file = IOTools.openFile(tempname)
        try:
            ids = set(IOTools.readList(id_file.readlines()))
        finally:
            id_file.close()

        # replaces the alignment count with the number of distinct names
        c.input_alignments = len(ids)

        for fastq in Fastq.iterate(options.stdin):
            c.input_reads += 1
            # normalise the fastq identifier before the lookup: strip a
            # trailing "/1" or "/2", or keep only the part before the space
            if (fastq.identifier.endswith("/1") or
                    fastq.identifier.endswith("/2")) and \
                    " " not in fastq.identifier:
                identifier = fastq.identifier[:-2]
            elif len(fastq.identifier.split(" ")) == 2:
                identifier = fastq.identifier.split(" ")[0]
            else:
                identifier = fastq.identifier

            if not options.invert:
                if identifier in ids:
                    c.output_reads += 1
                    options.stdout.write("%s\n" % fastq)
            else:
                if identifier in ids:
                    continue
                c.output_reads += 1
                options.stdout.write("%s\n" % fastq)

        E.info(c)
    finally:
        os.unlink(tempname)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two fastq filenames as positional arguments. With
    method ``join`` each pair of records is merged: the second read is
    passed through ``Genomics.complement`` and its qualities reversed, the
    most frequent offset ("diagonal") between shared 2-mers of both reads
    is determined, and the tail of the second read is appended to the
    first. Merged records are written to ``options.stdout``.

    raises ValueError if the number of positional arguments is not two.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        # the check is for exactly two files, so say so
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples: k-mer -> start positions in s1
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            # +1 so that the last k-mer of the read is included
            for x in range(len(s1) - tuple_size + 1):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size + 1):
                # fixed: this k-mer used to be assigned to ``c`` and thereby
                # clobbered the counter
                kmer = s2[x:x + tuple_size]
                # .get avoids inserting empty entries for unseen k-mers
                for y in d.get(kmer, ()):
                    offsets[x - y] += 1

            if not offsets:
                # no shared k-mer: nothing to anchor a merge on. This used
                # to fail with an IndexError; skip the pair and count it.
                c.no_overlap += 1
                continue

            # find maximum diagonal (ties resolved by the larger offset).
            # fixed: the result used to be bound to the name ``sorted``,
            # which made the builtin unreachable (UnboundLocalError).
            max_count, max_offset = max(
                (count, offset) for offset, count in offsets.items())

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            # fixed: write the record's text, not the record object itself
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def process_cgat(options):
    """Run the fastq operation selected by ``options.method``.

    Records are read from ``options.stdin`` and written to
    ``options.stdout``.  The paired variants of ``sample`` and ``sort``
    (``options.pair`` set) additionally read the file named by
    ``options.pair`` and write its records to
    ``options.output_filename_pattern``; both files are closed when the
    operation finishes or fails.

    Returns the ``E.Counter`` created at the start.  Only change-format,
    sample, apply, trim3, trim5 and unique update it; the other methods
    return it untouched.

    Raises ValueError if a paired ``sample``/``sort`` is requested without
    ``options.output_filename_pattern``.
    """
    # local import: contextlib is only needed for the paired modes
    from contextlib import closing

    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            # re.match anchors the pattern at the start of the sequence
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            # closing() guarantees both handles are released (and the
            # output flushed) even if iteration fails part-way
            with closing(IOTools.open_file(
                    options.output_filename_pattern, "w")) as outfile2:
                with closing(IOTools.open_file(options.pair)) as infile2:
                    for record1, record2 in zip(
                            Fastq.iterate(options.stdin),
                            Fastq.iterate(infile2)):
                        c.input += 1
                        # one draw per pair keeps both mates together
                        if random.random() <= sample_threshold:
                            c.output += 1
                            outfile1.write("%s\n" % record1)
                            outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.read_list(IOTools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare on the identifier up to the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            # NOTE(review): the shell pipeline works on the process's own
            # stdin/stdout, not on options.stdin/options.stdout, and its
            # exit status is ignored.
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            with closing(IOTools.open_file(options.pair)) as infile2:
                for record1, record2 in zip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(infile2)):
                    # key on the identifier minus its last two characters
                    # (presumably the "/1" and "/2" mate suffixes)
                    entries1[record1.identifier[:-2]] = (
                        record1.seq, record1.quals)
                    entries2[record2.identifier[:-2]] = (
                        record2.seq, record2.quals)

            outfile1 = options.stdout
            with closing(IOTools.open_file(
                    options.output_filename_pattern, "w")) as outfile2:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1),\
                    "paired files do not contain the same reads "\
                    "need to reconcile files"

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))
    return c
def processReads(infiles, outfile):
    '''process reads.

    *infiles* is a pair ``(infile, contaminant_file)``; *outfile* is the
    gzipped fastq file to create.  If ``checkPairs(infile)`` returns a
    second file the data is treated as paired.

    Depending on the ``process_*`` switches in ``PARAMS`` a shell pipeline
    is assembled from cutadapt (contaminant removal), fastx_artifacts_filter,
    fastx_trimmer, fastq_quality_trimmer, fastq_quality_filter and
    fastq2fastq.py (sampling), terminated by gzip.  For paired data with
    ``process_sample`` set, only sampling is performed.  Otherwise paired
    files are processed one after the other into temporary files and then
    reconciled with fastqs2fastqs.py.

    If no processing step is enabled a warning is issued and nothing is
    written.

    NOTE(review): ``P.run()`` is called without arguments; it presumably
    picks up ``statement`` and the values for the ``%(name)s`` placeholders
    from this function's local variables and from ``PARAMS``.  Do not
    rename locals or move the ``P.run()`` calls into helpers without
    confirming this.
    '''
    infile, contaminant_file = infiles

    do_sth = False
    # not referenced below; presumably read by P.run() - see note above
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_sample"] and infile2:
        E.warn("sampling can not be combined with other processing "
               "for paired ended reads")
        statement = (
            "zcat %(infile)s"
            " | python %(scriptsdir)s/fastq2fastq.py"
            " --sample=%(sample_proportion)f"
            " --pair=%(infile2)s"
            " --outfile-pair=%(outfile2)s"
            " --log=%(outfile)s_sample.log"
            " | gzip > %(outfile)s "
        )
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [
            " cutadapt %(adaptors)s"
            " --overlap=%(contamination_min_overlap_length)i"
            " --format=fastq"
            " %(contamination_options)s"
            " <( zcat < %(infile)s )"
            " 2>> %(outfile)s_contaminants.log "
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append('fastx_artifacts_filter -Q %(offset)i -v '
                 '%(artifacts_options)s 2>> %(outfile)s_artifacts.log')
        do_sth = True

    if PARAMS["process_trim"]:
        s.append('fastx_trimmer -Q %(offset)i -v '
                 '%(trim_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        # NOTE(review): this stage reuses %(trim_options)s and the
        # _trim.log file of fastx_trimmer; confirm that sharing the
        # options between both tools is intended.
        s.append('fastq_quality_trimmer -Q %(offset)i -v '
                 '%(trim_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    if PARAMS["process_filter"]:
        s.append('fastq_quality_filter -Q %(offset)i -v '
                 '%(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append('python %(scriptsdir)s/fastq2fastq.py '
                 '--sample=%(sample_proportion)f '
                 '--log=%(outfile)s_sample.log')
        # sampling is a processing step in its own right; without this a
        # sampling-only configuration was reported as "nothing done"
        do_sth = True

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        # the same pipeline is re-run with ``infile`` rebound to the
        # second file of the pair
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = (
            "python %(scriptsdir)s/fastqs2fastqs.py"
            " --method=reconcile"
            " --output-pattern=%(track)s.fastq.%%i.gz"
            " %(tmpfile1)s %(tmpfile2)s"
            " > %(outfile)s_reconcile.log"
        )
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
def processReads(infiles, outfile):
    '''process reads.

    *infiles* is a pair ``(infile, contaminant_file)``; *outfile* is the
    gzipped fastq file to create.  If ``checkPairs(infile)`` returns a
    second file the data is treated as paired.

    Three mutually exclusive modes, checked in this order:

    1. ``process_combine_reads``: paired reads are merged with ``flash``;
       optionally the combined and uncombined reads are concatenated into
       *outfile*.  Raises IOError for unpaired data.
    2. ``process_sample`` on paired data: both files are sampled together
       with fastq2fastq.py.
    3. otherwise a shell pipeline is assembled from cutadapt,
       fastx_artifacts_filter, fastx_trimmer, fastq_quality_trimmer,
       fastq_quality_filter and fastq2fastq.py (sampling) according to the
       ``process_*`` switches in ``PARAMS`` and terminated by gzip.  Paired
       files are processed one after the other into temporary files and
       then reconciled with fastqs2fastqs.py.  If no step is enabled a
       warning is issued and nothing is written.

    NOTE(review): ``P.run()`` is called without arguments; it presumably
    picks up ``statement`` and the values for the ``%(name)s`` placeholders
    from this function's local variables and from ``PARAMS``.  Do not
    rename locals or move the ``P.run()`` calls into helpers without
    confirming this.
    '''
    infile, contaminant_file = infiles

    do_sth = False
    # not referenced below; presumably read by P.run() - see note above
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        E.warn("combining reads can not be combined with other "
               "processing for paired ended reads")
        if not infile2:
            raise IOError("must have paired data to combine reads")

        read_len = PARAMS["combine_reads_read_length"]
        frag_len = PARAMS["combine_reads_fragment_length"]
        frag_stdev = PARAMS["combine_reads_fragment_length_stdev"]

        # space-joined string form of the three values; used only to
        # detect settings that render as empty strings
        fragment_options = " ".join(
            map(str, [read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length "
                "options will be ignored")
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""

        elif len(fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, "
                "--frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""

        elif read_len and frag_len and frag_stdev:
            # --max-overlap is known to be unset here (first branch)
            max_overlap = ""
            fragment_options = (
                "--read-len=%i --fragment-len=%i --fragment-len-stddev=%i"
                % (read_len, frag_len, frag_stdev))

        else:
            max_overlap = ""
            fragment_options = ""

        # optional flash arguments: empty string when not configured
        if PARAMS["combine_reads_min_overlap"]:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        else:
            min_overlap = ""

        if PARAMS["combine_reads_threads"]:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        else:
            threads = ""

        if PARAMS["combine_reads_phred_offset"]:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        else:
            phred_offset = ""

        if PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]
        else:
            max_mismatch_density = ""

        statement = (
            "flash %(min_overlap)s %(max_overlap)s"
            " %(max_mismatch_density)s %(phred_offset)s"
            " %(fragment_options)s"
            " --output-prefix=%(track)s"
            " %(threads)s"
            " --compress"
            " %(infile)s %(infile2)s"
            " >> %(outfile)s.log "
        )
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            # ``infiles`` is deliberately rebound: the statement below
            # refers to it as %(infiles)s
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz",
                    ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = ('''zcat %(infiles)s | gzip > %(outfile)s; '''
                         '''rm -rf %(infiles)s''')
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''

        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn("sampling can not be combined with other processing "
               "for paired ended reads")
        statement = (
            "zcat %(infile)s"
            " | python %(scriptsdir)s/fastq2fastq.py"
            " --sample=%(sample_proportion)f"
            " --pair=%(infile2)s"
            " --outfile-pair=%(outfile2)s"
            " --log=%(outfile)s_sample.log"
            " | gzip > %(outfile)s "
        )
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [
            " cutadapt %(adaptors)s"
            " --overlap=%(contamination_min_overlap_length)i"
            " --format=fastq"
            " %(contamination_options)s"
            " <( zcat < %(infile)s )"
            " 2>> %(outfile)s_contaminants.log "
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append('fastx_artifacts_filter -Q %(offset)i -v '
                 '%(artifacts_options)s 2>> %(outfile)s_artifacts.log')
        do_sth = True

    if PARAMS["process_trim"]:
        s.append('fastx_trimmer -Q %(offset)i -v '
                 '%(trim_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append('fastq_quality_trimmer -Q %(offset)i -v '
                 '%(trim_quality_options)s 2>> %(outfile)s_trim.log')
        do_sth = True

    if PARAMS["process_filter"]:
        s.append('fastq_quality_filter -Q %(offset)i -v '
                 '%(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append('python %(scriptsdir)s/fastq2fastq.py '
                 '--sample=%(sample_proportion)f '
                 '--log=%(outfile)s_sample.log')
        # sampling is a processing step in its own right; without this a
        # sampling-only configuration was reported as "nothing done"
        do_sth = True

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        # the same pipeline is re-run with ``infile`` rebound to the
        # second file of the pair
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = (
            "python %(scriptsdir)s/fastqs2fastqs.py"
            " --method=reconcile"
            " --output-pattern=%(track)s.fastq.%%s.gz"
            " %(tmpfile1)s %(tmpfile2)s"
            " > %(outfile)s_reconcile.log"
        )
        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and writes to
    ``options.stdout``.  Exactly one operation is applied; the first
    option set in this order wins: --change-format, --sample, --apply,
    --trim3, --uniq, --sort, --renumber-ids.

    With --pair, --sample and --sort also process the second-in-pair file
    and write it to --outfile-pair.

    Raises ValueError if --pair is given to --sample/--sort without
    --outfile-pair.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--change-format", dest="change_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="guess quality score format and set quality scores to "
        "format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample", dest="sample", type="float",
        help="sample a proportion of reads [default=%default].")

    parser.add_option(
        "--pair", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--outfile-pair", dest="outfile_pair", type="string",
        help="if data is paired, filename for second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--uniq", dest="uniq", action="store_true",
        help="remove duplicate reads (by name) [default=%default].")

    parser.add_option(
        "--apply", dest="apply", type="string",
        help="apply a filter to fastq file (taking only reads in "
        "filename) [default=%default].")

    parser.add_option(
        "--trim3", dest="trim3", type="int",
        help="trim # bases from 3' end [default=%default].")

    parser.add_option(
        "--sort", dest="sort", action="store_true",
        help="sort fastq by sequence id [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--renumber-ids", dest="renumber_ids", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        sample=None,
                        trim3=None,
                        pair=None,
                        apply=None,
                        uniq=False,
                        outfile_pair=None,
                        sort=None,
                        seed=None,
                        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    # one draw per pair keeps both mates together
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                # make sure the second-pair output is flushed and released
                outfile2.close()
        else:
            # single-end only: running this loop after the paired loop
            # would emit unpaired leftovers from stdin
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare on the identifier up to the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            # NOTE(review): the shell pipeline works on the process's own
            # stdin/stdout and its exit status is ignored.
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                # key on the identifier minus its last two characters
                # (presumably the "/1" and "/2" mate suffixes)
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                assert len(
                    set(entries1.keys()).intersection(set(entries2.keys()))
                ) == len(entries1), (
                    "paired files do not contain the same reads "
                    "need to reconcile files")

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry, entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry, entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` and writes to
    ``options.stdout``.  Exactly one operation is applied; the first
    option set in this order wins: --change-format, --sample, --apply,
    --trim3, --uniq, --sort, --renumber-ids.

    With --pair, --sample and --sort also process the second-in-pair file
    and write it to --outfile-pair.

    Raises ValueError if --pair is given to --sample/--sort without
    --outfile-pair.
    """
    if not argv:
        argv = sys.argv

    # quality score formats accepted by --change-format and --guess-format
    quality_formats = ('sanger', 'solexa', 'phred64', 'integer',
                       'illumina-1.8')

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format",
                      type="choice", choices=quality_formats,
                      help="guess quality score format and set quality "
                      "scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format",
                      type="choice", choices=quality_formats,
                      help="quality score format to assume if ambiguous "
                      "[default=%default].")

    parser.add_option("--sample", dest="sample", type="float",
                      help="sample a proportion of reads "
                      "[default=%default].")

    parser.add_option("--pair", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair", dest="outfile_pair", type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--uniq", dest="uniq", action="store_true",
                      help="remove duplicate reads (by name) "
                      "[default=%default].")

    parser.add_option("--apply", dest="apply", type="string",
                      help="apply a filter to fastq file (taking only "
                      "reads in filename) [default=%default].")

    parser.add_option("--trim3", dest="trim3", type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort", dest="sort", action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option("--seed", dest="seed", type="int",
                      help="seed for random number generator "
                      "[default=%default].")

    parser.add_option("--renumber-ids", dest="renumber_ids", type="string",
                      help="rename reads in file by pattern "
                      "[default=%default]")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        sample=None,
        trim3=None,
        pair=None,
        apply=None,
        uniq=False,
        outfile_pair=None,
        sort=None,
        seed=None,
        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                for record1, record2 in itertools.izip(
                        Fastq.iterate(options.stdin),
                        Fastq.iterate(IOTools.openFile(options.pair))):
                    c.input += 1
                    # one draw per pair keeps both mates together
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
            finally:
                # make sure the second-pair output is flushed and released
                outfile2.close()
        else:
            # single-end only: running this loop after the paired loop
            # would emit unpaired leftovers from stdin
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare on the identifier up to the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            # NOTE(review): the shell pipeline works on the process's own
            # stdin/stdout and its exit status is ignored.
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                # key on the identifier minus its last two characters
                # (presumably the "/1" and "/2" mate suffixes)
                entries1[record1.identifier[:-2]] = (
                    record1.seq, record1.quals)
                entries2[record2.identifier[:-2]] = (
                    record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            try:
                assert len(set(entries1.keys()).intersection(
                    set(entries2.keys()))) == len(entries1), (
                    "paired files do not contain the same reads "
                    "need to reconcile files")

                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry, entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry, entries2[entry][0],
                                    entries2[entry][1]))
            finally:
                outfile2.close()

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every fastq record read from ``options.stdin`` is passed through an
    ``Extractor`` built from --bc-pattern and --3prime, and the result is
    written to ``options.stdout``.  With --read2-in, the second-read file
    is iterated in lockstep, the extractor is called with both reads and
    the processed second read is written to --read2-out.

    Unless --supress-stats is given, a tab-separated table with one row
    per entry of ``processor.bc_count`` is written to ``options.stdlog``.

    Raises ValueError if --read2-in is given without --read2-out.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--supress-stats", dest="stats", action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(pattern=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Initialise the processor
    processor = Extractor(options.pattern, options.prime3)
    read1s = Fastq.iterate(options.stdin)

    if options.read2_in is None:
        for read in read1s:
            options.stdout.write(str(processor(read)) + "\n")
    else:
        if options.read2_out is None:
            raise ValueError(
                "please specify an output file for the second read "
                "(--read2-out)")

        read2_in = IOTools.openFile(options.read2_in)
        # must be opened for writing: processed second reads go here
        read2_out = IOTools.openFile(options.read2_out, "w")
        try:
            for read1, read2 in zip(read1s, Fastq.iterate(read2_in)):
                new_1, new_2 = processor(read1, read2)
                options.stdout.write(str(new_1) + "\n")
                read2_out.write(str(new_2) + "\n")
        finally:
            # flush and release both second-read files
            read2_out.close()
            read2_in.close()

    # write footer and output benchmark information.
    if options.stats:
        options.stdlog.write(
            "\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
        # keys are tuples that are joined with the count appended;
        # presumably (barcode, umi, sample) to match the header - confirm
        for key in processor.bc_count:
            options.stdlog.write(
                "\t".join(key + (str(processor.bc_count[key]),)) + "\n")

    E.Stop()