def start(self, barcodesFile, primerFile, samplesFile, verbose=True, debug=False):
    """
    Validate the barcode, primer and sample sheet files of a run.

    The barcode table is always loaded. The primer table is loaded and
    checked with ``self.validatePrimer`` only when ``primerFile`` is not
    None. The sample sheet is loaded and checked with
    ``self.validateSample`` against the barcode and primer tables.

    Parameters:
        barcodesFile: passed to ``barcodeTable``.
        primerFile: passed to ``primerTable``; None skips the primer check.
        samplesFile: passed to ``sampleTable``.
        verbose: when true, table sizes are written to stdout.
        debug: forwarded to the validators; when true, the traceback of an
            unexpected exception is written to stderr.

    Returns:
        0 when every check that ran returned 0, otherwise 1. An interrupt,
        SystemExit or any other exception calls ``self.clean()`` and also
        yields 1.
    """
    self.verbose = verbose
    try:
        barcodes = barcodeTable(barcodesFile)
        if self.verbose:
            sys.stdout.write("barcode table length: %s\n" % barcodes.getLength())

        # The primer table is optional; without one the primer check counts as passed.
        primers = None
        primer_status = 0
        if primerFile is not None:
            primers = primerTable(primerFile)
            if verbose:
                p5_total = len(primers.getP5sequences())
                p7_total = len(primers.getP7sequences())
                sys.stdout.write(
                    "primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n"
                    % (p5_total, p7_total))
            primer_status = self.validatePrimer(primers, debug)

        # The sample sheet is checked against both tables loaded above.
        samples = sampleTable(samplesFile)
        if verbose:
            sys.stdout.write(
                "sample table length: %s, and %s projects.\n"
                % (samples.getSampleNumber(), len(samples.getProjectList())))
        sample_status = self.validateSample(barcodes, primers, samples, debug)

        if primer_status == 0 and sample_status == 0:
            sys.stderr.write("Validation confirmed, files are ok\n")
            return 0
        sys.stderr.write("Failed validation\n")
        return 1
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        else:
            sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
        return 1
def start(self, barcodesFile, primerFile, samplesFile, verbose=True, debug=False):
    """
    Run the input-file validation: barcodes, optional primers, sample sheet.

    Loads the barcode table, then (only if ``primerFile`` is given) the
    primer table, then the sample sheet. The primer table is handed to
    ``self.validatePrimer`` and the sample sheet to ``self.validateSample``;
    ``debug`` is forwarded to both. With ``verbose`` set, the size of each
    table is reported on stdout.

    Returns 0 if both validator results equal 0 (a skipped primer check
    counts as 0) and 1 otherwise. Any exception, including
    KeyboardInterrupt and SystemExit, triggers ``self.clean()`` and a
    return value of 1; the traceback is printed only when ``debug`` is set.
    """
    self.verbose = verbose
    try:
        # read in barcode sequences
        bc_tab = barcodeTable(barcodesFile)
        if self.verbose:
            sys.stdout.write("barcode table length: %s\n" % bc_tab.getLength())

        # read in primer sequences when a primer file was supplied
        if primerFile is None:
            pr_tab = None
            primer_result = 0
        else:
            pr_tab = primerTable(primerFile)
            if verbose:
                primer_sizes = (len(pr_tab.getP5sequences()), len(pr_tab.getP7sequences()))
                sys.stdout.write("primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n" % primer_sizes)
            primer_result = self.validatePrimer(pr_tab, debug)

        # read in sample sheet
        sample_tab = sampleTable(samplesFile)
        if verbose:
            sample_sizes = (sample_tab.getSampleNumber(), len(sample_tab.getProjectList()))
            sys.stdout.write("sample table length: %s, and %s projects.\n" % sample_sizes)
        sample_result = self.validateSample(bc_tab, pr_tab, sample_tab, debug)

        all_ok = primer_result == 0 and sample_result == 0
        sys.stderr.write("Validation confirmed, files are ok\n" if all_ok else "Failed validation\n")
        return 0 if all_ok else 1
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        # format_exc() renders the exception currently being handled
        report = traceback.format_exc() if debug else "A fatal error was encountered. trying turning on debug\n"
        sys.stderr.write(report)
        return 1
def start(self, fastq_file1, fastq_file2, fastq_file3, fastq_file4, output_prefix,
          barcodesFile, primerFile, samplesFile, barcodeMaxDiff=1, I1rc=True, I2rc=False,
          dedup_float=4, primerMaxDiff=4, primerEndMatch=4, batchsize=10000,
          uncompressed=False, output_unidentified=False, minQ=None, minL=0,
          verbose=True, debug=False, kprimer=False, test=False):
    """
    Preprocess a four-read, double barcoded Illumina sequencing run.

    Steps, in order:
      1. Load the barcode table, and the primer table / sample sheet when
         their file arguments are not None; validate them with
         ``validateApp`` and return 1 if validation fails.
      2. Open the barcode count table file and one ``IlluminaTwoReadOutput``
         per project (sample sheet given) or a single "Identified" output,
         plus an "Unidentified" output when ``output_unidentified`` is set.
      3. Read the run in batches of ``batchsize``; for every read assign
         barcode, then primer and sample if enabled, then trim when ``minQ``
         is set and the read is still good. Good reads go to their project /
         "Identified" output, the others to "Unidentified" if requested.
      4. Write the barcode x primer count table (only when at least one
         read was identified) and print summary statistics to stdout.

    Parameters:
        fastq_file1..fastq_file4: passed to ``FourReadIlluminaRun``.
        output_prefix: directory (with a sample sheet) or file name prefix
            (without one) for all outputs.
        barcodesFile, I1rc, I2rc: passed to ``barcodeTable``.
        primerFile: passed to ``primerTable``; None disables primer handling.
        samplesFile: passed to ``sampleTable``; None disables sample handling.
        barcodeMaxDiff: passed to ``read.assignBarcode``.
        dedup_float, primerMaxDiff, primerEndMatch: passed to
            ``read.assignPrimer``.
        batchsize: number of reads requested from the run per batch.
        uncompressed: passed to ``IlluminaTwoReadOutput``.
        output_unidentified: also write reads that are not good reads.
        minQ, minL: passed to ``read.trimRead``; trimming is skipped when
            ``minQ`` is None.
        verbose: report table sizes and per-batch progress.
        debug: forwarded to the validators; print the traceback of an
            unexpected exception.
        kprimer: passed to ``read.getFastq`` for identified reads
            (presumably "keep primer" - confirm against the read class).
        test: stop after the first batch, to test the inputs.

    Returns:
        0 on success, 1 on failed validation, interrupt or any error.
        ``self.clean()`` is called on every one of these paths.
    """

    def percent(part, total):
        # Share of `total` in percent, one decimal. An empty run reports 0.0
        # instead of raising ZeroDivisionError.
        if not total:
            return 0.0
        return round((float(part) / float(total)) * 100, 1)

    def count_read(counts, read, primer_names):
        # Add one read to the barcode x primer tally. '-' is the key for
        # "no barcode" / "no primer"; without primers the column is "Total".
        barcode = read.getBarcode()
        bc_key = '-' if barcode is None else barcode
        if primer_names is None:
            pr_key = "Total"
        else:
            primer = read.getPrimer()
            pr_key = '-' if primer is None else primer
        row = counts.get(bc_key)
        if row is None:
            # First read for this barcode: start every known primer column at 0.
            row = counts[bc_key] = {}
            if primer_names is not None:
                for name in primer_names:
                    row[name] = 0
                row['-'] = 0
            row[pr_key] = 1
        else:
            row[pr_key] += 1

    def table_line(label, row, primer_names):
        # Render one line of the barcode table in header column order.
        if primer_names is None:
            fields = [label, str(row["Total"])]
        else:
            fields = [label] + [str(row[name]) for name in primer_names] + [str(row['-'])]
        return "\t".join(fields) + '\n'

    self.verbose = verbose
    evalPrimer = primerFile is not None
    evalSample = samplesFile is not None
    bcFile = None  # barcode table output handle; closed in the finally clause
    try:
        v = validateApp()
        # read in barcode sequences
        bcTable = barcodeTable(barcodesFile, I1rc, I2rc)
        if self.verbose:
            sys.stdout.write("barcode table length: %s\n" % bcTable.getLength())
        # read in primer sequences if present
        if evalPrimer:
            prTable = primerTable(primerFile)
            if verbose:
                sys.stdout.write(
                    "primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n"
                    % (len(prTable.getP5sequences()), len(prTable.getP7sequences())))
            if v.validatePrimer(prTable, debug) != 0:
                sys.stderr.write("Failed validation\n")
                self.clean()
                return 1
            # Column order of the count table; fetched once instead of per barcode.
            primer_names = list(prTable.getPrimers())
        else:
            prTable = None
            primer_names = None
        if evalSample:
            sTable = sampleTable(samplesFile)
            if verbose:
                sys.stdout.write(
                    "sample table length: %s, and %s projects.\n"
                    % (sTable.getSampleNumber(), len(sTable.getProjectList())))
            if v.validateSample(bcTable, prTable, sTable, debug) != 0:
                sys.stderr.write("Failed validation\n")
                self.clean()
                return 1

        # output table: opened before any read is processed so an unwritable
        # destination fails early. The name is computed outside the try so
        # the error message below can always refer to it.
        if evalSample:
            bctable_name = os.path.join(output_prefix, 'Identified_Barcodes.txt')
        else:
            bctable_name = output_prefix + '_Identified_Barcodes.txt'
        try:
            misc.make_sure_path_exists(os.path.dirname(bctable_name))
            bcFile = open(bctable_name, 'w')
        except Exception:
            sys.stderr.write("ERROR: Can't open file %s for writing\n" % bctable_name)
            raise

        barcode_counts = {}
        bcsuccesscount = 0
        prsuccesscount = 0
        sampsuccesscount = 0
        trimsuccesscount = 0
        identified_count = 0

        # setup output files
        self.run_out = {}
        if evalSample:
            for project in sTable.getProjectList():
                self.run_out[project] = IlluminaTwoReadOutput(
                    os.path.join(output_prefix, project), uncompressed)
        else:
            self.run_out["Identified"] = IlluminaTwoReadOutput(output_prefix, uncompressed)
        if output_unidentified:
            if evalSample:
                self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                    os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
            else:
                self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                    output_prefix + "_Unidentified", uncompressed)

        # establish and open the Illumina run
        self.run = FourReadIlluminaRun(fastq_file1, fastq_file2, fastq_file3, fastq_file4)
        self.run.open()
        totaltime = time.time()
        while True:
            lasttime = time.time()
            # get next batch of reads
            reads = self.run.next(batchsize)
            if len(reads) == 0:
                break
            # process individual reads
            for read in reads:
                bcsuccesscount += read.assignBarcode(bcTable, barcodeMaxDiff)  # barcode
                if evalPrimer:  # primer
                    prsuccesscount += read.assignPrimer(
                        prTable, dedup_float, primerMaxDiff, primerEndMatch)
                if evalSample:  # sample
                    sampsuccesscount += read.assignRead(sTable)  # barcode + primer
                if minQ is not None and read.goodRead:
                    trimsuccesscount += read.trimRead(minQ, minL)
                if read.goodRead is True:
                    identified_count += 1
                    if evalSample:
                        self.run_out[read.getProject()].addRead(read.getFastq(kprimer))
                    else:
                        self.run_out["Identified"].addRead(read.getFastq(kprimer))
                elif output_unidentified:
                    self.run_out["Unidentified"].addRead(read.getFastq(True))
                # Record data for final barcode table
                count_read(barcode_counts, read, primer_names)
            # Write out reads
            for key in self.run_out:
                self.run_out[key].writeReads()
            if self.verbose:
                processed = self.run.count()
                # Rate uses the real batch length (the last batch may be
                # short); the floor keeps a coarse clock from dividing by zero.
                elapsed = max(time.time() - lasttime, 1e-6)
                sys.stderr.write(
                    "processed %s total reads, %s Reads/second, %s identified reads(%s%%), %s unidentified reads\n"
                    % (processed, round(len(reads) / elapsed, 0), identified_count,
                       percent(identified_count, processed), processed - identified_count))
            if test:  # exit after the first batch to test the inputs
                break

        total_reads = self.run.count()
        if self.verbose:
            sys.stdout.write(
                "%s reads processed in %s minutes, %s (%s%%) identified\n\n"
                % (total_reads, round((time.time() - totaltime) / (60), 2),
                   identified_count, percent(identified_count, total_reads)))

        # Write out barcode and primer table
        if identified_count > 0:
            # write out header line
            if evalPrimer:
                txt = 'Barcode\t' + '\t'.join(primer_names) + '\tNone' + '\n'
            else:
                txt = 'Barcode\tTotal\n'
            bcFile.write(txt)
            # barcodes are listed in barcode-table order; unseen ones are skipped
            for bc in bcTable.getBarcodes():
                if bc in barcode_counts:
                    bcFile.write(table_line(str(bc), barcode_counts[bc], primer_names))
            # reads without a barcode go last, labelled 'None'
            if '-' in barcode_counts:
                bcFile.write(table_line('None', barcode_counts['-'], primer_names))

        # write out summary statistics
        sys.stdout.write(
            "%s reads (%s%% of total run) successfully identified barcode\n"
            % (bcsuccesscount, percent(bcsuccesscount, total_reads)))
        if evalPrimer:  # primer
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully identified barcode and primer\n"
                % (prsuccesscount, percent(prsuccesscount, total_reads)))
        if evalSample:  # sample
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully assigned to sample\n"
                % (sampsuccesscount, percent(sampsuccesscount, total_reads)))
        if minQ is not None:
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully pass trimming criteria\n"
                % (trimsuccesscount, percent(trimsuccesscount, total_reads)))
        unidentified = total_reads - identified_count
        sys.stdout.write(
            "%s reads (%s%% of total run) unidentified\n\n"
            % (unidentified, percent(unidentified, total_reads)))
        if evalSample and self.verbose:
            for key in self.run_out:
                found = self.run_out[key].count()
                sys.stdout.write(
                    "%s reads (%s%% of total run) found for project\t%s\n"
                    % (found, percent(found, total_reads), key))

        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        else:
            sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
        return 1
    finally:
        # The table handle is local, so self.clean() cannot close it.
        if bcFile is not None:
            bcFile.close()
def test_primers(self):
    """Check that primerLookupTable.txt yields 7 P5 and 7 P7 primer sequences."""
    p = primerTable("primerLookupTable.txt")
    # assert_ is a deprecated alias that was removed in Python 3.12; separate
    # assertEqual calls also report which count was wrong and its actual value.
    self.assertEqual(len(p.P5sequences), 7)
    self.assertEqual(len(p.P7sequences), 7)
def start(self, fastq_file1, fastq_file2, fastq_file3, fastq_file4, output_prefix, bcFile,
          primerFile=None, samplesFile=None, barcodeMaxDiff=1, I1rc=True, I2rc=False,
          dedup_float=4, primerMaxDiff=4, primerEndMatch=4, flip=False, batchsize=10000,
          uncompressed=False, output_unidentified=False, minQ=None, minL=0,
          verbose=True, debug=False, kprimer=False, test=False):
    """
    Preprocess a four-read, double barcoded Illumina sequencing run.

    Steps, in order:
      1. Load the barcode table, and the primer table / sample sheet when
         their file arguments are not None; validate them with
         ``validateApp`` and return 1 if validation fails.
      2. Open the barcode count table file and one ``IlluminaTwoReadOutput``
         per project (sample sheet given) or a single "Identified" output,
         plus an "Unidentified" output when ``output_unidentified`` is set.
      3. Read the run in batches of ``batchsize``; for every read assign
         barcode, then primer and sample if enabled, then trim when ``minQ``
         is set and the read is still good. Good reads go to their project /
         "Identified" output, the others to "Unidentified" if requested.
      4. Write the barcode x primer count table (only when at least one
         read was identified) and print summary statistics to stdout.

    Parameters:
        fastq_file1..fastq_file4: passed to ``FourReadIlluminaRun``.
        output_prefix: directory (with a sample sheet) or file name prefix
            (without one) for all outputs.
        bcFile, I1rc, I2rc: passed to ``barcodeTable``; ``bcFile`` is the
            barcode table input, not the count table that is written.
        primerFile: passed to ``primerTable``; None disables primer handling.
        samplesFile: passed to ``sampleTable``; None disables sample handling.
        barcodeMaxDiff: passed to ``read.assignBarcode``.
        dedup_float, primerMaxDiff, primerEndMatch, flip: passed to
            ``read.assignPrimer``.
        batchsize: number of reads requested from the run per batch.
        uncompressed: passed to ``IlluminaTwoReadOutput``.
        output_unidentified: also write reads that are not good reads.
        minQ, minL: passed to ``read.trimRead``; trimming is skipped when
            ``minQ`` is None.
        verbose: report table sizes and per-batch progress.
        debug: forwarded to the validators; print the traceback of an
            unexpected exception.
        kprimer: passed to ``read.getFastq`` for identified reads
            (presumably "keep primer" - confirm against the read class).
        test: stop after the first batch, to test the inputs.

    Returns:
        0 on success, 1 on failed validation, interrupt or any error.
        ``self.clean()`` is called on every one of these paths.
    """

    def percent(part, total):
        # Share of `total` in percent, one decimal. An empty run reports 0.0
        # instead of raising ZeroDivisionError.
        if not total:
            return 0.0
        return round((float(part) / float(total)) * 100, 1)

    def count_read(counts, read, primer_names):
        # Add one read to the barcode x primer tally. '-' is the key for
        # "no barcode" / "no primer"; without primers the column is "Total".
        barcode = read.getBarcode()
        bc_key = '-' if barcode is None else barcode
        if primer_names is None:
            pr_key = "Total"
        else:
            primer = read.getPrimer()
            pr_key = '-' if primer is None else primer
        row = counts.get(bc_key)
        if row is None:
            # First read for this barcode: start every known primer column at 0.
            row = counts[bc_key] = {}
            if primer_names is not None:
                for name in primer_names:
                    row[name] = 0
                row['-'] = 0
            row[pr_key] = 1
        else:
            row[pr_key] += 1

    def table_line(label, row, primer_names):
        # Render one line of the barcode table in header column order.
        if primer_names is None:
            fields = [label, str(row["Total"])]
        else:
            fields = [label] + [str(row[name]) for name in primer_names] + [str(row['-'])]
        return "\t".join(fields) + '\n'

    print('---')
    print('Running preprocessApp')
    print('')
    self.verbose = verbose
    evalPrimer = primerFile is not None
    evalSample = samplesFile is not None
    # Output handle for the count table. It gets its own name so the bcFile
    # argument (the barcode table input) is not overwritten; closed in finally.
    table_file = None
    try:
        v = validateApp()
        # read in barcode sequences
        bcTable = barcodeTable(bcFile, I1rc, I2rc)
        if self.verbose:
            sys.stdout.write("barcode table length: %s\n" % bcTable.getLength())
        # read in primer sequences if present
        if evalPrimer:
            prTable = primerTable(primerFile)
            if verbose:
                sys.stdout.write(
                    "primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n"
                    % (len(prTable.getP5sequences()), len(prTable.getP7sequences())))
            if v.validatePrimer(prTable, debug) != 0:
                sys.stderr.write("Failed validation\n")
                self.clean()
                return 1
            # Column order of the count table; fetched once instead of per barcode.
            primer_names = list(prTable.getPrimers())
        else:
            prTable = None
            primer_names = None
        if evalSample:
            sTable = sampleTable(samplesFile)
            if verbose:
                sys.stdout.write(
                    "sample table length: %s, and %s projects.\n"
                    % (sTable.getSampleNumber(), len(sTable.getProjectList())))
            if v.validateSample(bcTable, prTable, sTable, debug) != 0:
                sys.stderr.write("Failed validation\n")
                self.clean()
                return 1

        # output table: opened before any read is processed so an unwritable
        # destination fails early. The name is computed outside the try so
        # the error message below can always refer to it.
        if evalSample:
            bctable_name = os.path.join(output_prefix, 'Identified_Barcodes.txt')
        else:
            bctable_name = output_prefix + '_Identified_Barcodes.txt'
        try:
            misc.make_sure_path_exists(os.path.dirname(bctable_name))
            table_file = open(bctable_name, 'w')
        except Exception:
            sys.stderr.write("ERROR: Can't open file %s for writing\n" % bctable_name)
            raise

        barcode_counts = {}
        bcsuccesscount = 0
        prsuccesscount = 0
        sampsuccesscount = 0
        trimsuccesscount = 0
        identified_count = 0

        # setup output files
        self.run_out = {}
        if evalSample:
            for project in sTable.getProjectList():
                self.run_out[project] = IlluminaTwoReadOutput(
                    os.path.join(output_prefix, project), uncompressed)
        else:
            self.run_out["Identified"] = IlluminaTwoReadOutput(output_prefix, uncompressed)
        if output_unidentified:
            if evalSample:
                self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                    os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
            else:
                self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                    output_prefix + "_Unidentified", uncompressed)

        # establish and open the Illumina run
        self.run = FourReadIlluminaRun(fastq_file1, fastq_file2, fastq_file3, fastq_file4)
        self.run.open()
        totaltime = time.time()
        while True:
            lasttime = time.time()
            # get next batch of reads
            reads = self.run.next(batchsize)
            if len(reads) == 0:
                break
            # process individual reads
            for read in reads:
                bcsuccesscount += read.assignBarcode(bcTable, barcodeMaxDiff)  # barcode
                if evalPrimer:  # primer
                    prsuccesscount += read.assignPrimer(
                        prTable, dedup_float, primerMaxDiff, primerEndMatch, flip)
                if evalSample:  # sample
                    sampsuccesscount += read.assignRead(sTable)  # barcode + primer
                if minQ is not None and read.goodRead:
                    trimsuccesscount += read.trimRead(minQ, minL)
                if read.goodRead is True:
                    identified_count += 1
                    if evalSample:
                        self.run_out[read.getProject()].addRead(read.getFastq(kprimer))
                    else:
                        self.run_out["Identified"].addRead(read.getFastq(kprimer))
                elif output_unidentified:
                    self.run_out["Unidentified"].addRead(read.getFastq(True))
                # Record data for final barcode table
                count_read(barcode_counts, read, primer_names)
            # Write out reads
            for key in self.run_out:
                self.run_out[key].writeReads()
            if self.verbose:
                processed = self.run.count()
                # Rate uses the real batch length (the last batch may be
                # short); the floor keeps a coarse clock from dividing by zero.
                elapsed = max(time.time() - lasttime, 1e-6)
                sys.stderr.write(
                    "processed %s total reads, %s Reads/second, %s identified reads(%s%%), %s unidentified reads\n"
                    % (processed, round(len(reads) / elapsed, 0), identified_count,
                       percent(identified_count, processed), processed - identified_count))
            if test:  # exit after the first batch to test the inputs
                break

        total_reads = self.run.count()
        if self.verbose:
            sys.stdout.write(
                "%s reads processed in %s minutes, %s (%s%%) identified\n\n"
                % (total_reads, round((time.time() - totaltime) / (60), 2),
                   identified_count, percent(identified_count, total_reads)))

        # Write out barcode and primer table
        if identified_count > 0:
            # write out header line
            if evalPrimer:
                txt = 'Barcode\t' + '\t'.join(primer_names) + '\tNone' + '\n'
            else:
                txt = 'Barcode\tTotal\n'
            table_file.write(txt)
            # barcodes are listed in barcode-table order; unseen ones are skipped
            for bc in bcTable.getBarcodes():
                if bc in barcode_counts:
                    table_file.write(table_line(str(bc), barcode_counts[bc], primer_names))
            # reads without a barcode go last, labelled 'None'
            if '-' in barcode_counts:
                table_file.write(table_line('None', barcode_counts['-'], primer_names))

        # write out summary statistics
        sys.stdout.write(
            "%s reads (%s%% of total run) successfully identified barcode\n"
            % (bcsuccesscount, percent(bcsuccesscount, total_reads)))
        if evalPrimer:  # primer
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully identified barcode and primer\n"
                % (prsuccesscount, percent(prsuccesscount, total_reads)))
        if evalSample:  # sample
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully assigned to sample\n"
                % (sampsuccesscount, percent(sampsuccesscount, total_reads)))
        if minQ is not None:
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully pass trimming criteria\n"
                % (trimsuccesscount, percent(trimsuccesscount, total_reads)))
        unidentified = total_reads - identified_count
        sys.stdout.write(
            "%s reads (%s%% of total run) unidentified\n\n"
            % (unidentified, percent(unidentified, total_reads)))
        if evalSample and self.verbose:
            for key in self.run_out:
                found = self.run_out[key].count()
                sys.stdout.write(
                    "%s reads (%s%% of total run) found for project\t%s\n"
                    % (found, percent(found, total_reads), key))

        self.clean()
        return 0
    except (KeyboardInterrupt, SystemExit):
        self.clean()
        sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
        return 1
    except Exception:
        self.clean()
        if debug:
            sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
        else:
            sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
        return 1
    finally:
        # The table handle is local, so self.clean() cannot close it.
        if table_file is not None:
            table_file.close()