(os.path.join(args.out, file))) else: ufitslib.log.info( "Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, args.out)) for file in rawlist: shutil.copyfile(os.path.join(args.FASTQ, file), (os.path.join(args.out, file))) if '_R1' in file: filelist.append(file) else: #count FASTQ records in input ufitslib.log.info("Loading FASTQ Records") total = ufitslib.countfastq(args.FASTQ) size = ufitslib.checkfastqsize(args.FASTQ) readablesize = ufitslib.convertSize(size) ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')') #if --names given, load into dictonary if args.names: with open(args.names, 'rU') as input: reader = csv.reader(input) namesDict = {col[0]: col[1] for col in reader} else: ufitslib.log.info("No names csv passed, using BC header names") #load barcode fasta file into dictonary Barcodes = {} files = [] with open(args.barcodes, 'rU') as input:
if not RevSeq in RevBarcodes: RevBarcodes[RevSeq] = rec.id else: ufitslib.log.error("Duplicate reverse barcodes detected, exiting") sys.exit(1) #get number of CPUs to use if not args.cpus: cpus = multiprocessing.cpu_count() else: cpus = args.cpus #Count FASTQ records ufitslib.log.info("Loading FASTQ Records") orig_total = ufitslib.countfastq(SeqIn) size = ufitslib.checkfastqsize(SeqIn) readablesize = ufitslib.convertSize(size) ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #create tmpdir and split input into n cpus tmpdir = args.out.split('.')[0]+'_'+str(os.getpid()) if not os.path.exists(tmpdir): os.makedirs(tmpdir) #split the input FASTQ file into chunks to process with open(SeqIn, 'rU') as input: SeqRecords = SeqIO.parse(SeqIn, 'fastq') chunks = orig_total / (2*cpus)+1 #divide into chunks, store in tmp file for i, batch in enumerate(ufitslib.batch_iterator(SeqRecords, chunks)) : filename = "chunk_%i.fq" % (i+1) tmpout = os.path.join(tmpdir, filename)
ufitslib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), args.out)) filelist = rawlist for file in rawlist: shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file))) else: ufitslib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, args.out)) for file in rawlist: shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file))) if '_R1' in file: filelist.append(file) else: #count FASTQ records in input ufitslib.log.info("Loading FASTQ Records") total = ufitslib.countfastq(args.FASTQ) size = ufitslib.checkfastqsize(args.FASTQ) readablesize = ufitslib.convertSize(size) ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')') #if --names given, load into dictonary if args.names: with open(args.names, 'rU') as input: reader = csv.reader(input) namesDict = {col[0]:col[1] for col in reader} else: ufitslib.log.info("No names csv passed, using BC header names") #load barcode fasta file into dictonary Barcodes = {} files = [] with open(args.barcodes, 'rU') as input:
# --- dada2 pipeline preamble: environment checks + read QC before denoising ---
# NOTE(review): reconstructed formatting only — every code token is unchanged
# from the (whitespace-collapsed) original. Python 2 syntax (`print` statement).
ufitslib.log.debug(cmd_args)
print "-------------------------------------------------------"
# initialize script, log system info and usearch version
ufitslib.SystemInfo()
# Do a version check on the configured usearch binary
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)
# check external dependencies; dada2 itself is driven through Rscript
programs = ['Rscript']
ufitslib.CheckDependencies(programs)
# Count FASTQ records and remove 3' N's as dada2 can't handle them
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.fastq)
size = ufitslib.checkfastqsize(args.fastq)
readablesize = ufitslib.convertSize(size)
# e.g. "1,234,567 reads (2.3 GB)"
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
# write an N-stripped copy of the input next to the output prefix
no_ns = args.out+'.cleaned_input.fq'
ufitslib.fastq_strip_padding(args.fastq, no_ns)
# quality filter with vsearch: drop reads whose expected errors exceed
# args.maxee; --fastq_maxns 0 discards any read still containing an N,
# --fastq_qmax 55 raises the accepted quality-score ceiling
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out+'.qual-filtered.fq'
filtercmd = ['vsearch', '--fastq_filter', no_ns, '--fastq_maxee', str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55', '--fastq_maxns', '0']
ufitslib.runSubprocess(filtercmd, ufitslib.log)
# recount after filtering to report how many reads survived
total = ufitslib.countfastq(derep)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')
# Get Average length without any N's
# NOTE(review): getAvgLength is defined elsewhere in this file — presumably
# returns the mean read length of the filtered FASTQ; confirm at its definition.
averageLen = getAvgLength(derep)