def MergeReads(R1, R2, outname, read_length):
    """Truncate a read pair, merge it with USEARCH and write the combined FASTQ.

    Both inputs are truncated to ``read_length`` with
    ``vsearch --fastq_filter``; the truncated pair is then merged with
    ``usearch -fastq_mergepairs``.  Merged reads (plus the forward reads that
    failed to merge, when ``args.rescue_forward == 'on'``) are concatenated
    into ``<args.out>/<outname>.fq`` and all intermediate files are removed.

    Args:
        R1: path of the forward-read FASTQ file.
        R2: path of the reverse-read FASTQ file.
        outname: prefix for the intermediate files and the final file name.
        read_length: value handed to ``--fastq_trunclen``.

    Returns:
        The return value of ``ufitslib.log.info`` for the summary message.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    for raw_reads, trimmed_reads in ((R1, pretrim_R1), (R2, pretrim_R2)):
        cmd = ['vsearch', '--fastq_filter', raw_reads,
               '--fastq_trunclen', str(read_length),
               '--fastqout', trimmed_reads]
        ufitslib.runSubprocess(cmd, ufitslib.log)

    #next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    #merge the truncated files produced above; the previous code referenced
    #for_reads/rev_reads, which are not defined in this function, so the
    #truncation step had no effect on the merge
    cmd = [usearch, '-fastq_mergepairs', pretrim_R1,
           '-reverse', pretrim_R2,
           '-fastqout', merge_out,
           '-fastqout_notmerged_fwd', skip_for,
           '-minhsp', '12',
           '-fastq_maxdiffs', '8']
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #now concatenate files for downstream pre-process_illumina.py script
    outname = outname + '.fq'
    final_out = os.path.join(args.out, outname)
    with open(final_out, 'w') as cat_file:
        #open the inputs in their own with-blocks so the handles are closed
        with open(merge_out, 'rU') as merged:
            shutil.copyfileobj(merged, cat_file)
        if args.rescue_forward == 'on':
            with open(skip_for, 'rU') as unmerged:
                shutil.copyfileobj(unmerged, cat_file)

    #count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    #an empty input file would otherwise raise ZeroDivisionError
    pct_out = finalcount / float(origcount) if origcount else 0.0

    #clean and close up intermediate files
    for leftover in (merge_out, pretrim_R1, pretrim_R2, skip_for):
        os.remove(leftover)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' + '{0:.1%}'.format(pct_out) + ')')
def MergeReads(R1, R2, outname, read_length):
    """Truncate a read pair, merge it with USEARCH and write the combined FASTQ.

    Each input is truncated to ``read_length`` by ``vsearch --fastq_filter``,
    the two truncated files are merged by ``usearch -fastq_mergepairs``, and
    the merged reads are written to ``<args.out>/<outname>.fq``.  When
    ``args.rescue_forward`` equals ``'on'`` the forward reads that did not
    merge are appended to the same file.  Intermediate files are deleted.

    Args:
        R1: path of the forward-read FASTQ file.
        R2: path of the reverse-read FASTQ file.
        outname: prefix for the intermediate files and the final file name.
        read_length: value handed to ``--fastq_trunclen``.

    Returns:
        The return value of ``ufitslib.log.info`` for the summary message.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    cmd = ['vsearch', '--fastq_filter', R1, '--fastq_trunclen', str(read_length), '--fastqout', pretrim_R1]
    ufitslib.runSubprocess(cmd, ufitslib.log)
    cmd = ['vsearch', '--fastq_filter', R2, '--fastq_trunclen', str(read_length), '--fastqout', pretrim_R2]
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    #feed the truncated reads to the merger; for_reads/rev_reads were never
    #defined inside this function, which left the truncation step unused
    cmd = [usearch, '-fastq_mergepairs', pretrim_R1, '-reverse', pretrim_R2,
           '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for,
           '-minhsp', '12', '-fastq_maxdiffs', '8']
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #now concatenate files for downstream pre-process_illumina.py script
    outname = outname + '.fq'
    final_out = os.path.join(args.out, outname)
    to_concatenate = [merge_out]
    if args.rescue_forward == 'on':
        to_concatenate.append(skip_for)
    with open(final_out, 'w') as cat_file:
        for part in to_concatenate:
            #each source handle is closed as soon as it has been copied
            with open(part, 'rU') as part_handle:
                shutil.copyfileobj(part_handle, cat_file)

    #count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    if origcount:
        pct_out = finalcount / float(origcount)
    else:
        #no input reads: report 0% instead of raising ZeroDivisionError
        pct_out = 0.0

    #clean and close up intermediate files
    os.remove(merge_out)
    os.remove(pretrim_R1)
    os.remove(pretrim_R2)
    os.remove(skip_for)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' + '{0:.1%}'.format(pct_out) + ')')
#parenthesised print works as a statement (Python 2) and a function (Python 3)
print("-------------------------------------------------------")

#initialize script, log system info and usearch version
ufitslib.SystemInfo()

#Do a version check
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

#make tmp folder
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

#Count FASTQ records
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.FASTQ)
#NOTE(review): checkfastqsize is called unqualified here; presumably it is
#defined elsewhere in this script - confirm
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

#Expected Errors filtering step and convert to fasta
#str() keeps the path building working when maxee is numeric, matching the
#str(args.maxee) already used for the command line below
filter_out = os.path.join(tmp, args.out + '.EE' + str(args.maxee) + '.filter.fq')
filter_fasta = os.path.join(tmp, args.out + '.EE' + str(args.maxee) + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
#finally process reads over number of cpus
ufitslib.runMultiProgress(processRead, file_list, cpus)
#parenthesised print works as a statement (Python 2) and a function (Python 3)
print("-------------------------------------------------------")

#Now concatenate all of the demuxed files together
ufitslib.log.info("Concatenating Demuxed Files")
catDemux = args.out + '.demux.fq'
with open(catDemux, 'wb') as outfile:
    for filename in glob.glob(os.path.join(args.out, '*.demux.fq')):
        #never copy the combined file into itself
        if filename == catDemux:
            continue
        with open(filename, 'rU') as readfile:
            shutil.copyfileobj(readfile, outfile)

ufitslib.log.info("Counting FASTQ Records")
total = ufitslib.countfastq(catDemux)
ufitslib.log.info('{0:,}'.format(total) + ' reads processed')

#now loop through data and find barcoded samples, counting each.....
BarcodeCount = {}
#the handle is no longer named `input`, which shadowed the builtin
with open(catDemux, 'rU') as demux_handle:
    #take every 4th line starting at the first (assumes 4-line FASTQ records,
    #so these are the header lines)
    for line in itertools.islice(demux_handle, 0, None, 4):
        #sample ID = text after the last '=' up to the first ';'
        ID = line.split("=")[-1].split(";")[0]
        BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

#now let's count the barcodes found and count the number of times they are found.
barcode_counts = "%30s: %s" % ('Sample', 'Count')
shutil.copyfile(os.path.join(args.FASTQ, file), (os.path.join(args.out, file))) else: ufitslib.log.info( "Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, args.out)) for file in rawlist: shutil.copyfile(os.path.join(args.FASTQ, file), (os.path.join(args.out, file))) if '_R1' in file: filelist.append(file) else: #count FASTQ records in input ufitslib.log.info("Loading FASTQ Records") total = ufitslib.countfastq(args.FASTQ) size = ufitslib.checkfastqsize(args.FASTQ) readablesize = ufitslib.convertSize(size) ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')') #if --names given, load into dictonary if args.names: with open(args.names, 'rU') as input: reader = csv.reader(input) namesDict = {col[0]: col[1] for col in reader} else: ufitslib.log.info("No names csv passed, using BC header names") #load barcode fasta file into dictonary Barcodes = {} files = []
if not '_R2' in sorted(rawlist)[1]: ufitslib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), args.out)) filelist = rawlist for file in rawlist: shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file))) else: ufitslib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, args.out)) for file in rawlist: shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file))) if '_R1' in file: filelist.append(file) else: #count FASTQ records in input ufitslib.log.info("Loading FASTQ Records") total = ufitslib.countfastq(args.FASTQ) size = ufitslib.checkfastqsize(args.FASTQ) readablesize = ufitslib.convertSize(size) ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')') #if --names given, load into dictonary if args.names: with open(args.names, 'rU') as input: reader = csv.reader(input) namesDict = {col[0]:col[1] for col in reader} else: ufitslib.log.info("No names csv passed, using BC header names") #load barcode fasta file into dictonary Barcodes = {} files = []
RevSeq = str(rec.seq.reverse_complement()) if not RevSeq in RevBarcodes: RevBarcodes[RevSeq] = rec.id else: ufitslib.log.error("Duplicate reverse barcodes detected, exiting") sys.exit(1) #get number of CPUs to use if not args.cpus: cpus = multiprocessing.cpu_count() else: cpus = args.cpus #Count FASTQ records ufitslib.log.info("Loading FASTQ Records") orig_total = ufitslib.countfastq(SeqIn) size = ufitslib.checkfastqsize(SeqIn) readablesize = ufitslib.convertSize(size) ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #create tmpdir and split input into n cpus tmpdir = args.out.split('.')[0]+'_'+str(os.getpid()) if not os.path.exists(tmpdir): os.makedirs(tmpdir) #split the input FASTQ file into chunks to process with open(SeqIn, 'rU') as input: SeqRecords = SeqIO.parse(SeqIn, 'fastq') chunks = orig_total / (2*cpus)+1 #divide into chunks, store in tmp file for i, batch in enumerate(ufitslib.batch_iterator(SeqRecords, chunks)) : filename = "chunk_%i.fq" % (i+1)
#record the exact command line in the debug log
cmd_args = '%s\n' % " ".join(sys.argv)
ufitslib.log.debug(cmd_args)
print("-------------------------------------------------------")

#initialize script, log system info and usearch version
ufitslib.SystemInfo()

#Do a version check
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

#check dependencies
programs = ['Rscript']
ufitslib.CheckDependencies(programs)

#Count FASTQ records and remove 3' N's as dada2 can't handle them
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.fastq)
size = ufitslib.checkfastqsize(args.fastq)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
no_ns = args.out + '.cleaned_input.fq'
ufitslib.fastq_strip_padding(args.fastq, no_ns)

#quality filter
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = [
    'vsearch',
    '--fastq_filter', no_ns,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', derep,
    '--fastq_qmax', '55',
    '--fastq_maxns', '0',
]
ufitslib.runSubprocess(filtercmd, ufitslib.log)
total = ufitslib.countfastq(derep)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#Get Average length without any N's
#parenthesised print works as a statement (Python 2) and a function (Python 3)
print("-------------------------------------------------------")

#initialize script, log system info and usearch version
ufitslib.SystemInfo()

#Do a version check
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

#make tmp folder
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

#Count FASTQ records
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.FASTQ)
#NOTE(review): checkfastqsize is called unqualified here; presumably it is
#defined elsewhere in this script - confirm
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

#Expected Errors filtering step and convert to fasta
#str() keeps the path building working when maxee is numeric, matching the
#str(args.maxee) already used for the command line below
filter_out = os.path.join(tmp, args.out + '.EE' + str(args.maxee) + '.filter.fq')
filter_fasta = os.path.join(tmp, args.out + '.EE' + str(args.maxee) + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)

#second pass without an expected-error limit: writes the input reads as FASTA
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)

total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')
#NOTE(review): `p` is bound outside this excerpt - presumably a worker
#process being waited on; confirm against the surrounding code
p.join()
#parenthesised print works as a statement (Python 2) and a function (Python 3)
print("-------------------------------------------------------")

#Now concatenate all of the demuxed files together
ufitslib.log.info("Concatenating Demuxed Files")
catDemux = args.out + '.demux.fq'
with open(catDemux, 'wb') as outfile:
    for filename in glob.glob(os.path.join(args.out, '*.demux.fq')):
        #never copy the combined file into itself
        if filename == catDemux:
            continue
        with open(filename, 'rU') as readfile:
            shutil.copyfileobj(readfile, outfile)

ufitslib.log.info("Counting FASTQ Records")
total = ufitslib.countfastq(catDemux)
ufitslib.log.info('{0:,}'.format(total) + ' reads processed')

#now loop through data and find barcoded samples, counting each.....
BarcodeCount = {}
#the handle is no longer named `input`, which shadowed the builtin
with open(catDemux, 'rU') as demux_handle:
    #take every 4th line starting at the first (assumes 4-line FASTQ records,
    #so these are the header lines)
    for line in itertools.islice(demux_handle, 0, None, 4):
        #sample ID = text after the last '=' up to the first ';'
        ID = line.split("=")[-1].split(";")[0]
        BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

#now let's count the barcodes found and count the number of times they are found.
barcode_counts = "%30s: %s" % ('Sample', 'Count')
cpus = multiprocessing.cpu_count() else: cpus = args.cpus #get other values MAX_PRIMER_MISMATCHES = int(args.primer_mismatch) LabelPrefix = args.prefix MinLen = int(args.min_len) TrimLen = int(args.trim_len) PL = len(FwdPrimer) RL = len(RevPrimer) OutCount = 0 #split the input FASTQ file into chunks to process with open(SeqIn, 'rU') as input: SeqCount = ufitslib.countfastq(SeqIn) ufitslib.log.info('{0:,}'.format(SeqCount) + ' records loaded') SeqRecords = SeqIO.parse(SeqIn, 'fastq') chunks = SeqCount / cpus + 1 ufitslib.log.info( "splitting job over %i cpus, this may still take awhile" % cpus) #divide into chunks, store in tmp file pid = os.getpid() folder = 'ufits_tmp_' + str(pid) if not os.path.exists(folder): os.makedirs(folder) for i, batch in enumerate(ufitslib.batch_iterator(SeqRecords, chunks)): filename = "chunk_%i.fq" % (i + 1) tmpout = os.path.join(folder, filename) handle = open(tmpout, "w") count = SeqIO.write(batch, handle, "fastq")