Example #1
0
def MergeReads(R1, R2, outname, read_length):
    """Trim, merge and concatenate one pair of paired-end FASTQ files.

    Both inputs are truncated to ``read_length`` with vsearch, the trimmed
    pair is merged with USEARCH, and the merged reads -- plus the unmerged
    forward reads when ``args.rescue_forward == 'on'`` -- are written to
    ``os.path.join(args.out, outname + '.fq')``.  All intermediate files
    are deleted before returning.

    Args:
        R1: Path to the forward-read FASTQ file.
        R2: Path to the reverse-read FASTQ file.
        outname: Base name for the intermediate files and the final output.
        read_length: Value passed to vsearch ``--fastq_trunclen``.

    Returns:
        The result of ``ufitslib.log.info`` for the summary line.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    # Same truncation for the forward and the reverse file.
    for raw_reads, trimmed_reads in ((R1, pretrim_R1), (R2, pretrim_R2)):
        cmd = [
            'vsearch', '--fastq_filter', raw_reads, '--fastq_trunclen',
            str(read_length), '--fastqout', trimmed_reads
        ]
        ufitslib.runSubprocess(cmd, ufitslib.log)

    #next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    # Merge the trimmed pair written above.  The previous code passed
    # `for_reads` / `rev_reads`, names this function never defines, so the
    # trimmed files were produced and then ignored.
    cmd = [
        usearch, '-fastq_mergepairs', pretrim_R1, '-reverse', pretrim_R2,
        '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-minhsp',
        '12', '-fastq_maxdiffs', '8'
    ]
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #now concatenate files for downstream pre-process_illumina.py script
    outname = outname + '.fq'
    final_out = os.path.join(args.out, outname)
    with open(final_out, 'w') as cat_file:
        # Context managers close the inputs before os.remove() below.
        with open(merge_out, 'rU') as merged:
            shutil.copyfileobj(merged, cat_file)
        if args.rescue_forward == 'on':
            with open(skip_for, 'rU') as unmerged:
                shutil.copyfileobj(unmerged, cat_file)

    #count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    # An empty input file must not abort the run with ZeroDivisionError.
    pct_out = finalcount / float(origcount) if origcount else 0.0

    #clean and close up intermediate files
    for leftover in (merge_out, pretrim_R1, pretrim_R2, skip_for):
        os.remove(leftover)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' +
                             '{0:.1%}'.format(pct_out) + ')')
def MergeReads(R1, R2, outname, read_length):
    """Truncate both read files, merge the pair and write ``<outname>.fq``.

    vsearch truncates R1 and R2 to ``read_length``; USEARCH then merges the
    truncated pair.  The merged reads, followed by the unmerged forward
    reads when ``args.rescue_forward == 'on'``, are concatenated into
    ``os.path.join(args.out, outname + '.fq')`` and the intermediate files
    are removed.

    Args:
        R1: Path to the forward-read FASTQ file.
        R2: Path to the reverse-read FASTQ file.
        outname: Base name for the intermediate files and the final output.
        read_length: Value passed to vsearch ``--fastq_trunclen``.

    Returns:
        The result of ``ufitslib.log.info`` for the summary line.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    cmd = ['vsearch', '--fastq_filter', R1, '--fastq_trunclen', str(read_length), '--fastqout', pretrim_R1]
    ufitslib.runSubprocess(cmd, ufitslib.log)
    cmd = ['vsearch', '--fastq_filter', R2, '--fastq_trunclen', str(read_length), '--fastqout', pretrim_R2]
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    #feed the truncated files to the merger; `for_reads`/`rev_reads` were
    #never defined in this function, so the truncation output went unused
    cmd = [usearch, '-fastq_mergepairs', pretrim_R1, '-reverse', pretrim_R2, '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-minhsp', '12', '-fastq_maxdiffs', '8']
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #now concatenate files for downstream pre-process_illumina.py script
    outname = outname + '.fq'
    final_out = os.path.join(args.out, outname)
    sources = [merge_out]
    if args.rescue_forward == 'on':
        sources.append(skip_for)
    with open(final_out, 'w') as cat_file:
        for source in sources:
            #each input handle is closed before the file is deleted below
            with open(source, 'rU') as part:
                shutil.copyfileobj(part, cat_file)

    #count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    #report 0% for an empty input instead of raising ZeroDivisionError
    if origcount:
        pct_out = finalcount / float(origcount)
    else:
        pct_out = 0.0

    #clean and close up intermediate files
    os.remove(merge_out)
    os.remove(pretrim_R1)
    os.remove(pretrim_R2)
    os.remove(skip_for)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed ('+'{0:.1%}'.format(pct_out)+')')
# Visual separator on stdout ahead of the log output.
print("-------------------------------------------------------")

# Log system information, then run the version check on the usearch binary.
ufitslib.SystemInfo()
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

# Intermediate files live in "<out>_tmp"; create it on first use.
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Report the record count and size of the input FASTQ.
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.FASTQ)
# NOTE(review): checkfastqsize is called unqualified here -- presumably
# defined or imported at module level; confirm.
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info(
    '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
)

# Output paths for the expected-error filter (FASTQ and FASTA) and for the
# unfiltered FASTA copy.
# NOTE(review): args.maxee is concatenated directly, so it must already be
# a string at this point.
filter_out = os.path.join(
    tmp, args.out + '.EE' + args.maxee + '.filter.fq')
filter_fasta = os.path.join(
    tmp, args.out + '.EE' + args.maxee + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
# Hand the file list to the multi-CPU runner.
ufitslib.runMultiProgress(processRead, file_list, cpus)
print("-------------------------------------------------------")
# Append every "*.demux.fq" found under args.out to one combined FASTQ.
ufitslib.log.info("Concatenating Demuxed Files")

catDemux = args.out + '.demux.fq'
with open(catDemux, 'wb') as outfile:
    for filename in glob.glob(os.path.join(args.out, '*.demux.fq')):
        # Never append the combined file to itself.
        if filename != catDemux:
            with open(filename, 'rU') as readfile:
                shutil.copyfileobj(readfile, outfile)

ufitslib.log.info("Counting FASTQ Records")
total = ufitslib.countfastq(catDemux)
ufitslib.log.info('{0:,}'.format(total) + ' reads processed')

# Tally reads per sample.  Only every 4th line (the FASTQ header) is read;
# the sample ID is the text after the last "=" up to the next ";".
BarcodeCount = {}
with open(catDemux, 'rU') as input:
    header = itertools.islice(input, 0, None, 4)
    for line in header:
        ID = line.split("=")[-1].split(";")[0]
        BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

# Heading row of the per-sample count report.
barcode_counts = "%30s:  %s" % ('Sample', 'Count')
                shutil.copyfile(os.path.join(args.FASTQ, file),
                                (os.path.join(args.out, file)))
        else:
            ufitslib.log.info(
                "Found %i paired-end files, copying to %s folder" %
                (len(rawlist) / 2, args.out))
            for file in rawlist:
                shutil.copyfile(os.path.join(args.FASTQ, file),
                                (os.path.join(args.out, file)))
                if '_R1' in file:
                    filelist.append(file)

else:
    #count FASTQ records in input
    ufitslib.log.info("Loading FASTQ Records")
    total = ufitslib.countfastq(args.FASTQ)
    size = ufitslib.checkfastqsize(args.FASTQ)
    readablesize = ufitslib.convertSize(size)
    ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

    #if --names given, load into dictonary
    if args.names:
        with open(args.names, 'rU') as input:
            reader = csv.reader(input)
            namesDict = {col[0]: col[1] for col in reader}
    else:
        ufitslib.log.info("No names csv passed, using BC header names")

    #load barcode fasta file into dictonary
    Barcodes = {}
    files = []
Example #6
0
        if not '_R2' in sorted(rawlist)[1]:
            ufitslib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), args.out))
            filelist = rawlist
            for file in rawlist:
                shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file)))
        else:
            ufitslib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, args.out))
            for file in rawlist:
                shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(args.out,file)))
                if '_R1' in file:
                    filelist.append(file)

else:
    #count FASTQ records in input
    ufitslib.log.info("Loading FASTQ Records")
    total = ufitslib.countfastq(args.FASTQ)
    size = ufitslib.checkfastqsize(args.FASTQ)
    readablesize = ufitslib.convertSize(size)
    ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

    #if --names given, load into dictonary
    if args.names:
        with open(args.names, 'rU') as input:
            reader = csv.reader(input)
            namesDict = {col[0]:col[1] for col in reader}
    else:
        ufitslib.log.info("No names csv passed, using BC header names")

    #load barcode fasta file into dictonary
    Barcodes = {}
    files = []
Example #7
0
                RevSeq = str(rec.seq.reverse_complement())
                if not RevSeq in RevBarcodes:
                    RevBarcodes[RevSeq] = rec.id
                else:
                    ufitslib.log.error("Duplicate reverse barcodes detected, exiting")
                    sys.exit(1)
    
# Use the requested CPU count, or every available core when none is given.
cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

# Report the record count and size of the input FASTQ.
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(SeqIn)
size = ufitslib.checkfastqsize(SeqIn)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info(
    '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
)

# Per-run scratch folder: text of args.out before its first "." joined to
# this process's PID with an underscore.
tmpdir = '_'.join([args.out.split('.')[0], str(os.getpid())])
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)
#split the input FASTQ file into chunks to process
with open(SeqIn, 'rU') as input:
    SeqRecords = SeqIO.parse(SeqIn, 'fastq')
    chunks = orig_total / (2*cpus)+1
    #divide into chunks, store in tmp file
    for i, batch in enumerate(ufitslib.batch_iterator(SeqRecords, chunks)) :
        filename = "chunk_%i.fq" % (i+1)
Example #8
0
# Record the exact command line in the debug log.
cmd_args = " ".join(sys.argv) + '\n'
ufitslib.log.debug(cmd_args)
print("-------------------------------------------------------")

# Log system information, then run the version check on the usearch binary.
ufitslib.SystemInfo()
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

# External programs passed to the dependency check.
programs = ['Rscript']
ufitslib.CheckDependencies(programs)

# Report input size, then write a copy with the 3' N padding removed
# (dada2 cannot handle it).
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.fastq)
size = ufitslib.checkfastqsize(args.fastq)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info(
    '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
)
no_ns = args.out + '.cleaned_input.fq'
ufitslib.fastq_strip_padding(args.fastq, no_ns)

# Expected-error quality filter; reads containing any N are dropped too.
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = [
    'vsearch',
    '--fastq_filter', no_ns,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', derep,
    '--fastq_qmax', '55',
    '--fastq_maxns', '0',
]
ufitslib.runSubprocess(filtercmd, ufitslib.log)
total = ufitslib.countfastq(derep)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#Get Average length without any N's
Example #9
0
# Visual separator on stdout ahead of the log output.
print("-------------------------------------------------------")

# Log system information, then run the version check on the usearch binary.
ufitslib.SystemInfo()
usearch = args.usearch
ufitslib.versionDependencyChecks(usearch)

# Intermediate files live in "<out>_tmp"; create it on first use.
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Report the record count and size of the input FASTQ.
ufitslib.log.info("Loading FASTQ Records")
orig_total = ufitslib.countfastq(args.FASTQ)
# NOTE(review): checkfastqsize is called unqualified here -- presumably
# defined or imported at module level; confirm.
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info(
    '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
)

# Output paths for the filtered FASTQ/FASTA and the unfiltered FASTA.
# NOTE(review): args.maxee is concatenated directly, so it must already be
# a string at this point.
filter_out = os.path.join(
    tmp, args.out + '.EE' + args.maxee + '.filter.fq')
filter_fasta = os.path.join(
    tmp, args.out + '.EE' + args.maxee + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')

# Pass 1: expected-error filter, written as both FASTQ and FASTA.
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)

# Pass 2: same input without the maxee option, written as FASTA only.
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')
Example #10
0
p.join()

print("-------------------------------------------------------")
# Build one combined FASTQ from all "*.demux.fq" files under args.out.
ufitslib.log.info("Concatenating Demuxed Files")

catDemux = args.out + '.demux.fq'
with open(catDemux, 'wb') as outfile:
    for filename in glob.glob(os.path.join(args.out, '*.demux.fq')):
        # The combined output must not be copied into itself.
        if filename != catDemux:
            with open(filename, 'rU') as readfile:
                shutil.copyfileobj(readfile, outfile)

ufitslib.log.info("Counting FASTQ Records")
total = ufitslib.countfastq(catDemux)
ufitslib.log.info('{0:,}'.format(total) + ' reads processed')

# Count reads per sample from the header line of each 4-line record; the
# sample ID is whatever follows the last "=" and precedes the next ";".
BarcodeCount = {}
with open(catDemux, 'rU') as input:
    header = itertools.islice(input, 0, None, 4)
    for line in header:
        ID = line.split("=")[-1].split(";")[0]
        BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

# First row of the per-sample count table.
barcode_counts = "%30s:  %s" % ('Sample', 'Count')
    cpus = multiprocessing.cpu_count()
else:
    cpus = args.cpus

# Module-level settings derived from the parsed command-line arguments.
# int() conversions imply the raw option values may arrive as strings.
MAX_PRIMER_MISMATCHES = int(args.primer_mismatch)
LabelPrefix = args.prefix
MinLen = int(args.min_len)
TrimLen = int(args.trim_len)
# Primer lengths, computed once here.
# NOTE(review): FwdPrimer / RevPrimer are set outside this excerpt.
PL = len(FwdPrimer)
RL = len(RevPrimer)
# Counter initialised to zero -- presumably incremented as reads are
# written; confirm against the code that uses it.
OutCount = 0

#split the input FASTQ file into chunks to process
with open(SeqIn, 'rU') as input:
    SeqCount = ufitslib.countfastq(SeqIn)
    ufitslib.log.info('{0:,}'.format(SeqCount) + ' records loaded')
    SeqRecords = SeqIO.parse(SeqIn, 'fastq')
    chunks = SeqCount / cpus + 1
    ufitslib.log.info(
        "splitting job over %i cpus, this may still take awhile" % cpus)
    #divide into chunks, store in tmp file
    pid = os.getpid()
    folder = 'ufits_tmp_' + str(pid)
    if not os.path.exists(folder):
        os.makedirs(folder)
    for i, batch in enumerate(ufitslib.batch_iterator(SeqRecords, chunks)):
        filename = "chunk_%i.fq" % (i + 1)
        tmpout = os.path.join(folder, filename)
        handle = open(tmpout, "w")
        count = SeqIO.write(batch, handle, "fastq")