def start(self, fastq_file1, fastq_file2, barcode1, barcode2, output_prefix,
          batchsize=100000, uncompressed=False, verbose=True, debug=False):
    """
    Convert a double barcoded Illumina run from two reads to four reads.

    Reads are fetched from the paired input in batches of ``batchsize``. Each
    read is converted with ``getFourReads`` using ``barcode1``/``barcode2`` as
    the barcode lengths, and every batch is written through an
    ``IlluminaFourReadOutput`` created on ``output_prefix``.

    Progress goes to stderr and the final summary to stdout when ``verbose``
    is set. A traceback is written to stderr only when ``debug`` is set.

    Returns 0 on success and 1 when interrupted or when any error occurs.
    """
    self.verbose = verbose
    try:
        # Output is created before the input run is opened.
        self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
        self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
        self.run.open()
        started = time.time()
        while True:
            batch = self.run.next(batchsize)
            if len(batch) == 0:
                # an empty batch marks the end of the input
                break
            for entry in batch:
                four_reads = entry.getFourReads(bc1_length=barcode1, bc2_length=barcode2)
                self.run_out.addRead(four_reads)
            self.run_out.writeReads()
            if self.verbose:
                total = self.run.count()
                rate = round(self.run.count() / (time.time() - started), 0)
                sys.stderr.write("processed %s total reads, %s Reads/second\n" % (total, rate))
        if self.verbose:
            total = self.run.count()
            minutes = round((time.time() - started) / (60), 2)
            sys.stdout.write("%s reads processed in %s minutes\n" % (total, minutes))
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except:
        self.clean()
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write(traceback.format_exc())
        return 1
def start(self, fastq_file1, fastq_file2, output_prefix, samplesFile,
          batchsize=10000, uncompressed=False, output_unidentified=False,
          verbose=True, debug=False):
    """
    Split a double barcoded Illumina sequencing run by project.

    The sample table is loaded from ``samplesFile`` and one
    ``IlluminaTwoReadOutput`` is created per project under ``output_prefix``
    (plus an 'UnidentifiedProject' output when ``output_unidentified`` is
    set). Reads are fetched in batches of ``batchsize`` and assigned with
    ``assignRead``; reads flagged ``goodRead`` go to their project's output,
    all others are counted as unidentified.

    Percentages are reported as 0 when the run contains no reads, so an empty
    input finishes normally instead of failing with a ZeroDivisionError.

    Returns 0 on success and 1 when interrupted or when any error occurs. A
    traceback is written to stderr only when ``debug`` is set.
    """
    self.verbose = verbose

    def percent_of_run(part):
        # Share of all reads seen so far, as a percentage; 0.0 for an empty run.
        total = self.run.count()
        if total == 0:
            return 0.0
        return (float(part) / float(total)) * 100

    try:
        # read in the sample table
        sTable = sampleTable(samplesFile)
        if self.verbose:
            sys.stdout.write("sample table length: %s, and %s projects.\n" % (sTable.getSampleNumber(), len(sTable.getProjectList())))
        # setup output files
        identified_count = 0
        unidentified_count = 0
        self.run_out = {}
        for project in sTable.getProjectList():
            self.run_out[project] = IlluminaTwoReadOutput(os.path.join(output_prefix, project), uncompressed)
        if output_unidentified:
            self.run_out["Unidentified"] = IlluminaTwoReadOutput(os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
        # establish and open the Illumina run
        self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
        self.run.open()
        lasttime = time.time()
        while True:
            # get next batch of reads; an empty batch marks the end of the input
            reads = self.run.next(batchsize)
            if len(reads) == 0:
                break
            for read in reads:
                read.assignRead(sTable)
                if read.goodRead is True:
                    self.run_out[read.project].addRead(read.getFastq())
                    identified_count += 1
                else:
                    unidentified_count += 1
                    if output_unidentified:
                        self.run_out["Unidentified"].addRead(read.getFastq())
            # write out this batch for every project
            for key in self.run_out:
                self.run_out[key].writeReads()
            if self.verbose:
                sys.stderr.write("processed %s total reads, %s Reads/second, %s identified reads, %s unidentified reads (%s%%)\n" % (self.run.count(), round(self.run.count()/(time.time() - lasttime), 0), identified_count, unidentified_count, round(percent_of_run(identified_count))))
        if self.verbose:
            sys.stdout.write("%s reads processed in %s minutes, %s (%s%%) identified\n\n" % (self.run.count(), round((time.time()-lasttime)/(60), 2), identified_count, round(percent_of_run(identified_count), 1)))
        # NOTE(review): the per-project table is written regardless of
        # `verbose`; the original nesting could not be recovered from the
        # collapsed source - confirm.
        for key in self.run_out:
            sys.stdout.write("%s (%s%%)\treads found for project\t%s\n" % (self.run_out[key].count(), round(percent_of_run(self.run_out[key].count()), 1), key))
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        if not debug:
            sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
def start(self, fastq_file1, fastq_file2, barcode1, barcode2, barcodesFile,
          max_diff, flip, output_prefix, batchsize=100000, uncompressed=False,
          verbose=True, debug=False):
    """
    Convert a two-read run with inline barcodes into a four-read set.

    A barcode table is loaded from ``barcodesFile`` (with ``i1_rc=False``).
    Reads are fetched in batches of ``batchsize`` and converted with
    ``getFourReadsInline``. An empty conversion result counts as a failed
    read and is dropped; otherwise element 4 of the result says whether the
    read was flipped and elements 0-3 are written out.

    Returns 0 on success and 1 when interrupted or when any error occurs. A
    traceback is written to stderr only when ``debug`` is set.
    """
    self.verbose = verbose
    try:
        barcodes = barcodeTable(barcodesFile, i1_rc=False)
        if self.verbose:
            sys.stdout.write("barcode table length: %s\n" % barcodes.getLength())
        # Output is created before the input run is opened.
        self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
        self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
        self.run.open()
        failed_reads = 0
        flipped_reads = 0
        started = time.time()
        while True:
            batch = self.run.next(batchsize)
            if len(batch) == 0:
                # an empty batch marks the end of the input
                break
            for entry in batch:
                converted = entry.getFourReadsInline(
                    barcodes, bc1_length=barcode1, bc2_length=barcode2,
                    max_diff=max_diff, flip=flip)
                if len(converted) == 0:
                    # nothing usable came back: failed read
                    failed_reads += 1
                else:
                    if converted[4]:
                        flipped_reads += 1
                    self.run_out.addRead(converted[0:4])
            self.run_out.writeReads()
            if self.verbose:
                total = self.run.count()
                rate = round(self.run.count() / (time.time() - started), 0)
                sys.stderr.write("processed %s total reads, %s failed reads, %s flipped reads, %s Reads/second\n" % (total, failed_reads, flipped_reads, rate))
        if self.verbose:
            total = self.run.count()
            minutes = round((time.time() - started) / (60), 2)
            sys.stdout.write("%s reads processed, %s failed reads, %s flipped reads in %s minutes\n" % (total, failed_reads, flipped_reads, minutes))
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write(traceback.format_exc())
        return 1
class convertApp:
    """
    Convert two read Illumina files (barcodes processed) back to a four read
    set to be processed with dbcAmplicons
    """

    def __init__(self):
        self.verbose = False
        # Handles are created by start(); None means "nothing to close".
        self.run = None
        self.run_out = None

    def start(self, fastq_file1, fastq_file2, barcode1, barcode2, output_prefix,
              batchsize=100000, uncompressed=False, verbose=True, debug=False):
        """
        Start conversion of double barcoded Illumina sequencing run from two
        to four reads.

        Reads are fetched in batches of ``batchsize``, converted with
        ``getFourReads`` using ``barcode1``/``barcode2`` as the barcode
        lengths, and written through an ``IlluminaFourReadOutput`` created on
        ``output_prefix``.

        Returns 0 on success and 1 when interrupted or when any error occurs.
        A traceback is written to stderr only when ``debug`` is set.
        """
        self.verbose = verbose
        try:
            # setup output files
            self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
            # establish and open the Illumina run
            self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.run.open()
            lasttime = time.time()
            while True:
                # get next batch of reads; an empty batch marks the end
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                for read in reads:
                    self.run_out.addRead(read.getFourReads(bc1_length=barcode1, bc2_length=barcode2))
                self.run_out.writeReads()
                if self.verbose:
                    sys.stderr.write("processed %s total reads, %s Reads/second\n" % (self.run.count(), round(self.run.count()/(time.time() - lasttime), 0)))
            if self.verbose:
                sys.stdout.write("%s reads processed in %s minutes\n" % (self.run.count(), round((time.time()-lasttime)/(60), 2)))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            return 1

    def clean(self):
        """
        Close the input run and the output, best effort.

        Each handle is closed independently, so a missing or failing input
        handle no longer prevents the output from being closed. Close errors
        are not raised; they are reported on stderr when ``verbose`` is set.
        """
        if self.verbose:
            sys.stderr.write("Cleaning up.\n")
        for label in ("run", "run_out"):
            # getattr: the handle may not exist if start() failed early
            handle = getattr(self, label, None)
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                if self.verbose:
                    sys.stderr.write("Warning: could not close %s cleanly.\n" % label)
def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix, rdpPath,
          gene='16srrna', train=None, batchsize=10000, minQ=None, minL=0,
          procs=1, test=False, verbose=True, debug=False):
    """
    Start classifying double barcoded Illumina sequencing run.

    Single reads (``fastq_fileU``) are processed first, then paired reads
    (``fastq_file1``/``fastq_file2``). Every batch of ``batchsize`` reads is
    written as FASTA to ``<output_prefix>.<reads so far>`` and handed to
    ``rdpCall`` on a pool of ``procs`` worker processes. Once all jobs are
    finished, the per-batch ``.fixrank`` files are concatenated into
    ``<output_prefix>.fixrank`` and removed.

    ``gene`` must be one of 16srrna, fungallsu, fungalits_warcup or
    fungalits_unite unless ``train`` is given. ``minQ``/``minL`` are passed
    to ``trimRead``; reads are only trimmed (and filtered on ``goodRead``)
    when at least one of them differs from 0. ``test`` stops after the first
    batch of each input.

    The worker pool is always shut down before returning, including on
    error or interrupt. The output-count sanity warning is applied to both
    single and paired input.

    Returns 0 on success and 1 when interrupted or when any error occurs. A
    traceback is written to stderr only when ``debug`` is set.
    """
    results = {}
    pool = None
    self.verbose = verbose
    try:
        if train is None and gene not in ('16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite'):
            sys.stderr.write("ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n")
            raise Exception
        # establish and open the Illumina run
        if fastq_file1 is not None and fastq_file2 is not None:
            self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.runPairs.open()
        else:
            self.runPairs = None
        if fastq_fileU is not None:
            self.runSingle = OneReadIlluminaRun(fastq_fileU)
            self.runSingle.open()
        else:
            self.runSingle = None
        if self.runPairs is None and self.runSingle is None:
            sys.stderr.write("ERROR:[classify] input reads not specified, or incorrect pairs\n")
            raise Exception
        lasttime = time.time()
        batch = 0
        pool = Pool(procs, maxtasksperchild=1)
        # (input run, name of the read method that renders it as FASTA)
        inputs = ((self.runSingle, "getFasta"), (self.runPairs, "getJoinedFasta"))
        for run, fasta_method in inputs:
            if run is None:
                continue
            while True:
                # get next batch of reads; an empty batch marks the end
                reads = run.next(batchsize)
                batch = batch + len(reads)
                if len(reads) == 0:
                    break
                run_out = IlluminaFastaOutput(output_prefix + "." + str(batch))
                for read in reads:
                    # NOTE(review): minQ defaults to None and None != 0, so
                    # trimming is applied by default - confirm intended.
                    if minQ != 0 or minL != 0:
                        read.trimRead(minQ, minL)
                        if read.goodRead == True:
                            run_out.addRead(getattr(read, fasta_method)())
                    else:
                        run_out.addRead(getattr(read, fasta_method)())
                if run_out.count() > batchsize:
                    sys.stderr.write("WARNING:[classify] output count exceeds batch count\n")
                run_out.writeReads()
                rdp_out = output_prefix + "." + str(batch) + ".fixrank"
                results[rdp_out] = pool.apply_async(rdpCall, (run_out.output_prefix, rdp_out, gene, train, rdpPath, self.verbose))
                if test:
                    break
        # poll once per second until check_status reports nothing pending
        while True:
            time.sleep(1)
            if check_status(results) == 0:
                break
        if self.verbose:
            sys.stderr.write("Combining temporary files\n")
        with open(output_prefix + ".fixrank", "wb") as outfile:
            for part in results:
                with open(part, "rb") as infile:
                    outfile.write(infile.read())
                os.remove(part)
        if self.verbose:
            sys.stdout.write("%s reads processed in %s minutes\n" % (batch, round((time.time() - lasttime) / (60), 2)))
        self.clean(results)
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean(results)
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean(results)
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
    finally:
        if pool is not None:
            # On success every job has already finished; on failure any
            # outstanding work is abandoned. Either way no workers are left.
            pool.terminate()
            pool.join()
def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix, rdpPath,
          gene='16srrna', train=None, batchsize=10000, minQ=None, minL=0,
          procs=1, test=False, verbose=True, debug=False):
    """
    Start classifying double barcoded Illumina sequencing run.

    Single reads (``fastq_fileU``) are processed first, then paired reads
    (``fastq_file1``/``fastq_file2``). Every batch of ``batchsize`` reads is
    written as FASTA to ``<output_prefix>.<reads so far>`` and handed to
    ``rdpCall`` on a pool of ``procs`` worker processes. Once all jobs are
    finished, the per-batch ``.fixrank`` files are concatenated into
    ``<output_prefix>.fixrank`` and removed.

    ``gene`` must be one of 16srrna, fungallsu, fungalits_warcup or
    fungalits_unite unless ``train`` is given. ``minQ``/``minL`` are passed
    to ``trimRead``; reads are only trimmed (and filtered on ``goodRead``)
    when at least one of them differs from 0. ``test`` stops after the first
    batch of each input.

    The worker pool is always shut down before returning, including on
    error or interrupt. The output-count sanity warning is applied to both
    single and paired input.

    Returns 0 on success and 1 when interrupted or when any error occurs. A
    traceback is written to stderr only when ``debug`` is set.
    """
    results = {}
    pool = None
    self.verbose = verbose
    allowed_genes = ('16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite')
    try:
        if train is None and gene not in allowed_genes:
            sys.stderr.write("ERROR:[classify] parameter -g (--gene) must be one of 16srrna or fungallsu or fungalits_warcup or fungalits_unite \n")
            raise Exception
        # establish and open the Illumina run
        if fastq_file1 is not None and fastq_file2 is not None:
            self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.runPairs.open()
        else:
            self.runPairs = None
        if fastq_fileU is not None:
            self.runSingle = OneReadIlluminaRun(fastq_fileU)
            self.runSingle.open()
        else:
            self.runSingle = None
        if self.runPairs is None and self.runSingle is None:
            sys.stderr.write("ERROR:[classify] input reads not specified, or incorrect pairs\n")
            raise Exception
        lasttime = time.time()
        batch = 0
        pool = Pool(procs, maxtasksperchild=1)
        # (input run, name of the read method that renders it as FASTA)
        inputs = ((self.runSingle, "getFasta"), (self.runPairs, "getJoinedFasta"))
        for run, fasta_method in inputs:
            if run is None:
                continue
            while True:
                # get next batch of reads; an empty batch marks the end
                reads = run.next(batchsize)
                batch = batch + len(reads)
                if len(reads) == 0:
                    break
                run_out = IlluminaFastaOutput(output_prefix + "." + str(batch))
                for read in reads:
                    # NOTE(review): minQ defaults to None and None != 0, so
                    # trimming is applied by default - confirm intended.
                    if minQ != 0 or minL != 0:
                        read.trimRead(minQ, minL)
                        if read.goodRead == True:
                            run_out.addRead(getattr(read, fasta_method)())
                    else:
                        run_out.addRead(getattr(read, fasta_method)())
                if run_out.count() > batchsize:
                    sys.stderr.write("WARNING:[classify] output count exceeds batch count\n")
                run_out.writeReads()
                rdp_out = output_prefix + "." + str(batch) + ".fixrank"
                results[rdp_out] = pool.apply_async(rdpCall, (run_out.output_prefix, rdp_out, gene, train, rdpPath, self.verbose))
                if test:
                    break
        # poll once per second until check_status reports nothing pending
        while True:
            time.sleep(1)
            if check_status(results) == 0:
                break
        if self.verbose:
            sys.stderr.write("Combining temporary files\n")
        with open(output_prefix + ".fixrank", "wb") as outfile:
            for part in results:
                with open(part, "rb") as infile:
                    outfile.write(infile.read())
                os.remove(part)
        if self.verbose:
            sys.stdout.write("%s reads processed in %s minutes\n" % (batch, round((time.time() - lasttime) / (60), 2)))
        self.clean(results)
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean(results)
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean(results)
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
    finally:
        if pool is not None:
            # On success every job has already finished; on failure any
            # outstanding work is abandoned. Either way no workers are left.
            pool.terminate()
            pool.join()
def preprocPair_with_inlineBC(self, fastq_file1, fastq_file2, barcode1, barcode2,
                              bcFile, max_diff, flip_float, output_prefix,
                              batchsize=100000, uncompressed=False, verbose=True,
                              debug=False):
    """
    Convert a two-read run with inline barcodes into a four-read set.

    A barcode table is loaded from ``bcFile`` (with ``i1_rc=False``). Reads
    are fetched in batches of ``batchsize`` and converted with
    ``getFourReadsInline``. An empty conversion result counts as a failed
    read and is dropped; otherwise element 4 of the result says whether the
    read was flipped and elements 0-3 are written out.

    ``self.clean()`` is now called before the successful return (it used to
    sit after the ``return`` and was never reached), and an unreachable
    trailing ``print(run_out)`` of an undefined name was removed.

    Returns the list of the four output file names
    (``<output_prefix>_R1.fastq.gz`` ... ``_R4.fastq.gz``) on success, and 1
    when interrupted or when any error occurs. A traceback is written to
    stderr only when ``debug`` is set.
    """
    print('---')
    print('Running preprocPair_with_inlineBC')
    print('')
    self.verbose = verbose
    try:
        # read in barcode sequences
        bcTable = barcodeTable(bcFile, i1_rc=False)
        if self.verbose:
            sys.stdout.write("barcode table length: %s\n" % bcTable.getLength())
        # setup output files
        self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
        # establish and open the Illumina run
        self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
        self.run.open()
        failed_reads = 0
        flipped_reads = 0
        lasttime = time.time()
        while True:
            # get next batch of reads; an empty batch marks the end
            reads = self.run.next(batchsize)
            if len(reads) == 0:
                break
            for read in reads:
                tmp = read.getFourReadsInline(bcTable, bc1_length=barcode1, bc2_length=barcode2, max_diff=max_diff, flip=flip_float)
                if len(tmp) == 0:
                    # failed read
                    failed_reads += 1
                    continue
                if tmp[4]:
                    flipped_reads += 1
                self.run_out.addRead(tmp[0:4])
            self.run_out.writeReads()
            if self.verbose:
                sys.stderr.write("processed %s total reads, %s failed reads, %s flipped reads, %s Reads/second\n" % (self.run.count(), failed_reads, flipped_reads, round(self.run.count() / (time.time() - lasttime), 0)))
        if self.verbose:
            sys.stdout.write("%s reads processed, %s failed reads, %s flipped reads in %s minutes\n" % (self.run.count(), failed_reads, flipped_reads, round((time.time() - lasttime) / (60), 2)))
        # Clean up before handing the file names back to the caller.
        self.clean()
        # NOTE(review): the names always end in .fastq.gz, even when
        # `uncompressed` is True - confirm against the output class's naming.
        suffixes = ('_R1.fastq.gz', '_R2.fastq.gz', '_R3.fastq.gz', '_R4.fastq.gz')
        return [output_prefix + suffix for suffix in suffixes]
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
class convertApp:
    """
    Convert two read Illumina files (barcodes processed) back to a four read
    set to be processed with dbcAmplicons
    """

    def __init__(self):
        self.verbose = False
        # Handles are created by start(); None means "nothing to close".
        self.run = None
        self.run_out = None

    def start(self, fastq_file1, fastq_file2, barcode1, barcode2, barcodesFile,
              max_diff, flip, output_prefix, batchsize=100000,
              uncompressed=False, verbose=True, debug=False):
        """
        Start conversion of double barcoded Illumina sequencing run from two
        to four reads, using inline barcodes.

        A barcode table is loaded from ``barcodesFile``. Reads are fetched in
        batches of ``batchsize`` and converted with ``getFourReadsInline``.
        An empty conversion result counts as a failed read and is dropped;
        otherwise element 4 of the result says whether the read was flipped
        and elements 0-3 are written out.

        Returns 0 on success and 1 when interrupted or when any error occurs.
        A traceback is written to stderr only when ``debug`` is set.
        """
        self.verbose = verbose
        try:
            # read in barcode sequences
            bcTable = barcodeTable(barcodesFile)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" % bcTable.getLength())
            # setup output files
            self.run_out = IlluminaFourReadOutput(output_prefix, uncompressed)
            # establish and open the Illumina run
            self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.run.open()
            failed_reads = 0
            flipped_reads = 0
            lasttime = time.time()
            while True:
                # get next batch of reads; an empty batch marks the end
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                for read in reads:
                    tmp = read.getFourReadsInline(bcTable, bc1_length=barcode1, bc2_length=barcode2, max_diff=max_diff, flip=flip)
                    if len(tmp) == 0:
                        # failed read
                        failed_reads += 1
                        continue
                    if tmp[4]:
                        flipped_reads += 1
                    self.run_out.addRead(tmp[0:4])
                self.run_out.writeReads()
                if self.verbose:
                    sys.stderr.write("processed %s total reads, %s failed reads, %s flipped reads, %s Reads/second\n" % (self.run.count(), failed_reads, flipped_reads, round(self.run.count() / (time.time() - lasttime), 0)))
            if self.verbose:
                sys.stdout.write("%s reads processed, %s failed reads, %s flipped reads in %s minutes\n" % (self.run.count(), failed_reads, flipped_reads, round((time.time() - lasttime) / (60), 2)))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            return 1

    def clean(self):
        """
        Close the input run and the output, best effort.

        Each handle is closed independently, so a missing or failing input
        handle no longer prevents the output from being closed. Close errors
        are not raised; they are reported on stderr when ``verbose`` is set.
        """
        if self.verbose:
            sys.stderr.write("Cleaning up.\n")
        for label in ("run", "run_out"):
            # getattr: the handle may not exist if start() failed early
            handle = getattr(self, label, None)
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                if self.verbose:
                    sys.stderr.write("Warning: could not close %s cleanly.\n" % label)
def start(self, fastq_file1, fastq_file2, fastq_fileU, output_prefix,
          batchsize=100000, uncompressed=False, verbose=True, debug=False):
    """
    Split a double barcoded Illumina sequencing run by sample identifier.

    Either paired input (``fastq_file1`` and ``fastq_file2``) or single-read
    input (``fastq_fileU``) is accepted, not both. Reads are fetched in
    batches of ``batchsize``. For every new ``read.sample`` value an output
    is created under ``output_prefix`` (``IlluminaTwoReadOutput`` for pairs,
    ``IlluminaOneReadOutput`` for single reads), and each read is added to
    its sample's output via ``getFastqSRA``.

    ``self.clean()`` is now also called on the successful path, matching the
    error paths.

    Returns 0 on success and 1 on invalid input, interrupt or any error. A
    traceback is written to stderr only when ``debug`` is set.
    """
    self.verbose = verbose
    have_pairs = fastq_file1 is not None and fastq_file2 is not None
    if fastq_fileU is not None and have_pairs:
        sys.stderr.write("ERROR:[SplitBySample] cannot have both paired and single reads\n")
        return 1
    try:
        if have_pairs:
            self.runPairs = TwoReadIlluminaRun(fastq_file1, fastq_file2)
            self.runPairs.open()
        else:
            self.runPairs = None
        if fastq_fileU is not None:
            self.runSingle = OneReadIlluminaRun(fastq_fileU)
            self.runSingle.open()
        else:
            self.runSingle = None
        if self.runPairs is None and self.runSingle is None:
            sys.stderr.write("ERROR:[SplitBySample] input reads not specified, or incorrect pairs\n")
            raise Exception
        self.run_out = {}
        # Paired input takes precedence, as before; only one run is processed.
        if self.runPairs is not None:
            run, output_class = self.runPairs, IlluminaTwoReadOutput
        else:
            run, output_class = self.runSingle, IlluminaOneReadOutput
        while True:
            if self.verbose:
                sys.stderr.write("Processing sequence files.\n")
            # get next batch of reads; an empty batch marks the end
            reads = run.next(batchsize)
            if len(reads) == 0:
                break
            for read in reads:
                sample = read.sample
                # create the sample's output the first time it is seen
                if sample not in self.run_out:
                    self.run_out[sample] = output_class(os.path.join(output_prefix, sample), uncompressed)
                self.run_out[sample].addRead(read.getFastqSRA())
            # write out this batch for every sample seen so far
            for key in self.run_out:
                self.run_out[key].writeReads()
        if self.verbose:
            sys.stderr.write("\nSplit out %s total samples in %s.\n" % (len(self.run_out), output_prefix))
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated.\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        sys.stderr.write("A fatal error was encountered.\n")
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        return 1
def start(self, fastq_file1, fastq_file2, output_prefix, samplesFile,
          batchsize=10000, uncompressed=False, output_unidentified=False,
          verbose=True, debug=False):
    """
    Split a double barcoded Illumina sequencing run by project.

    The sample table is loaded from ``samplesFile`` and one
    ``IlluminaTwoReadOutput`` is created per project under ``output_prefix``
    (plus an 'UnidentifiedProject' output when ``output_unidentified`` is
    set). Reads are fetched in batches of ``batchsize`` and assigned with
    ``assignRead``; reads flagged ``goodRead`` go to their project's output,
    all others are counted as unidentified.

    Percentages are reported as 0 when the run contains no reads, so an empty
    input finishes normally instead of failing with a ZeroDivisionError.

    Returns 0 on success and 1 when interrupted or when any error occurs. A
    traceback is written to stderr only when ``debug`` is set.
    """
    self.verbose = verbose

    def percent_of_run(part):
        # Share of all reads seen so far, as a percentage; 0.0 for an empty run.
        total = self.run.count()
        if total == 0:
            return 0.0
        return (float(part) / float(total)) * 100

    try:
        # read in the sample table
        sTable = sampleTable(samplesFile)
        if self.verbose:
            sys.stdout.write(
                "sample table length: %s, and %s projects.\n" %
                (sTable.getSampleNumber(), len(sTable.getProjectList())))
        # setup output files
        identified_count = 0
        unidentified_count = 0
        self.run_out = {}
        for project in sTable.getProjectList():
            self.run_out[project] = IlluminaTwoReadOutput(
                os.path.join(output_prefix, project), uncompressed)
        if output_unidentified:
            self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
        # establish and open the Illumina run
        self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
        self.run.open()
        lasttime = time.time()
        while True:
            # get next batch of reads; an empty batch marks the end of the input
            reads = self.run.next(batchsize)
            if len(reads) == 0:
                break
            for read in reads:
                read.assignRead(sTable)
                if read.goodRead is True:
                    self.run_out[read.project].addRead(read.getFastq())
                    identified_count += 1
                else:
                    unidentified_count += 1
                    if output_unidentified:
                        self.run_out["Unidentified"].addRead(read.getFastq())
            # write out this batch for every project
            for key in self.run_out:
                self.run_out[key].writeReads()
            if self.verbose:
                sys.stderr.write(
                    "processed %s total reads, %s Reads/second, %s identified reads, %s unidentified reads (%s%%)\n" %
                    (self.run.count(),
                     round(self.run.count() / (time.time() - lasttime), 0),
                     identified_count, unidentified_count,
                     round(percent_of_run(identified_count))))
        if self.verbose:
            sys.stdout.write(
                "%s reads processed in %s minutes, %s (%s%%) identified\n\n" %
                (self.run.count(),
                 round((time.time() - lasttime) / (60), 2),
                 identified_count,
                 round(percent_of_run(identified_count), 1)))
        # NOTE(review): the per-project table is written regardless of
        # `verbose`; the original nesting could not be recovered from the
        # collapsed source - confirm.
        for key in self.run_out:
            sys.stdout.write(
                "%s (%s%%)\treads found for project\t%s\n" %
                (self.run_out[key].count(),
                 round(percent_of_run(self.run_out[key].count()), 1),
                 key))
        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        if not debug:
            sys.stderr.write(
                "A fatal error was encountered. trying turning on debug\n")
        if debug:
            sys.stderr.write("".join(
                traceback.format_exception(*sys.exc_info())))
        return 1