(os.path.join(args.out, file)))
        else:
            ufitslib.log.info(
                "Found %i paired-end files, copying to %s folder" %
                (len(rawlist) / 2, args.out))
            for file in rawlist:
                shutil.copyfile(os.path.join(args.FASTQ, file),
                                (os.path.join(args.out, file)))
                if '_R1' in file:
                    filelist.append(file)

else:
    #count FASTQ records in input
    ufitslib.log.info("Loading FASTQ Records")
    total = ufitslib.countfastq(args.FASTQ)
    size = ufitslib.checkfastqsize(args.FASTQ)
    readablesize = ufitslib.convertSize(size)
    ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

    #if --names given, load into dictionary
    if args.names:
        with open(args.names, 'rU') as input:
            reader = csv.reader(input)
            namesDict = {col[0]: col[1] for col in reader}
    else:
        ufitslib.log.info("No names csv passed, using BC header names")

    #load barcode fasta file into dictionary
    Barcodes = {}
    files = []
    with open(args.barcodes, 'rU') as input:
                if not RevSeq in RevBarcodes:
                    RevBarcodes[RevSeq] = rec.id
                else:
                    ufitslib.log.error("Duplicate reverse barcodes detected, exiting")
                    sys.exit(1)
    
#Number of worker CPUs: the value passed in args.cpus when it is set (truthy),
#otherwise the count reported by multiprocessing.cpu_count()
cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

#Tally the records in the input FASTQ and log the count together with the
#human-readable file size
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(SeqIn)
size = ufitslib.checkfastqsize(SeqIn)
readablesize = ufitslib.convertSize(size)
#format(n, ',') adds thousands separators, same as '{0:,}'.format(n)
ufitslib.log.info(format(orig_total, ',') + ' reads (' + readablesize + ')')

#Scratch directory for the split input, named <prefix>_<pid> where <prefix> is
#the part of args.out before its first '.' and <pid> is this process id
tmpdir = '_'.join((args.out.partition('.')[0], str(os.getpid())))
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)
#split the input FASTQ file into chunks to process
with open(SeqIn, 'rU') as input:
    #parse from the handle opened above, so the file is opened only once and
    #is closed by the with-statement (the path used to be handed to
    #SeqIO.parse again, leaving this handle unread)
    SeqRecords = SeqIO.parse(input, 'fastq')
    #value handed to batch_iterator (presumably the records-per-batch size):
    #total reads divided by twice the CPU count, plus one. Floor division
    #keeps it an integer under both Python 2 and Python 3 division rules.
    chunks = orig_total // (2 * cpus) + 1
    #divide into chunks, store in tmp file
    for i, batch in enumerate(ufitslib.batch_iterator(SeqRecords, chunks)):
        #chunk files are numbered starting at 1
        filename = "chunk_%i.fq" % (i+1)
        tmpout = os.path.join(tmpdir, filename)
Exemple #3
0
            ufitslib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), args.out))
            filelist = rawlist
            for file in rawlist:
                shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file)))
        else:
            ufitslib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, args.out))
            for file in rawlist:
                shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file)))
                if '_R1' in file:
                    filelist.append(file)

else:
    #count FASTQ records in input
    ufitslib.log.info("Loading FASTQ Records")
    total = ufitslib.countfastq(args.FASTQ)
    size = ufitslib.checkfastqsize(args.FASTQ)
    readablesize = ufitslib.convertSize(size)
    ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

    #if --names given, load into dictionary
    if args.names:
        with open(args.names, 'rU') as input:
            reader = csv.reader(input)
            namesDict = {col[0]:col[1] for col in reader}
    else:
        ufitslib.log.info("No names csv passed, using BC header names")

    #load barcode fasta file into dictionary
    Barcodes = {}
    files = []
    with open(args.barcodes, 'rU') as input:
#write cmd_args (built outside this excerpt) to the debug log
ufitslib.log.debug(cmd_args)
#print(...) with a single parenthesised string emits the same line under
#Python 2 and is also valid under Python 3, where the bare print statement
#is a SyntaxError
print("-------------------------------------------------------")
#initialize script, log system info and usearch version
ufitslib.SystemInfo()
#Do a version check
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

#check dependencies: the list of external program names is passed to
#ufitslib.CheckDependencies
programs = ['Rscript']
ufitslib.CheckDependencies(programs)

#Count FASTQ records and log the total with the readable file size, then
#remove 3' N's as dada2 can't handle them
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.fastq)
size = ufitslib.checkfastqsize(args.fastq)
readablesize = ufitslib.convertSize(size)
#format(n, ',') adds thousands separators, same as '{0:,}'.format(n)
ufitslib.log.info(format(orig_total, ',') + ' reads (' + readablesize + ')')
#the stripped copy of the input is written next to the other outputs
no_ns = args.out + '.cleaned_input.fq'
ufitslib.fastq_strip_padding(args.fastq, no_ns)

#Quality filter the N-stripped reads with vsearch --fastq_filter, writing the
#reads that pass to derep
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
#one flag/value pair per line; order is the same as on the command line
filtercmd = [
    'vsearch',
    '--fastq_filter', no_ns,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', derep,
    '--fastq_qmax', '55',
    '--fastq_maxns', '0',
]
ufitslib.runSubprocess(filtercmd, ufitslib.log)
#report how many reads are in the filtered output
total = ufitslib.countfastq(derep)
ufitslib.log.info(format(total, ',') + ' reads passed')

#Get Average length without any N's
#(computed from derep, the quality-filtered FASTQ path set above;
# getAvgLength is defined outside this excerpt)
averageLen = getAvgLength(derep)