Ejemplo n.º 1
0
def splitter(inputfile, tempdir):
    '''
    Split a FASTQ file into chunk files inside a temporary directory.

    The record count of *inputfile* is divided by ``4 * cpus`` (``cpus`` is
    read from module scope) to obtain the value handed to
    ``amptklib.batch_iterator`` -- presumably the number of records per
    batch.  Every batch is written to ``chunk_<n>.fq`` in *tempdir*, with
    ``n`` starting at 1.

    inputfile: path to the FASTQ file to split.
    tempdir: directory that receives the chunk files; it is not created here.

    Returns a list with the joined path of every entry in *tempdir* whose
    name ends with ".fq", in the order reported by ``os.listdir``.
    '''
    total = amptklib.countfastq(inputfile)
    # Floor division keeps the value an integer on Python 3 as well as on
    # Python 2; the +1 ensures it is never zero for small inputs.
    chunks = total // (4 * cpus) + 1
    # Plain 'r' mode: the 'U' flag is deprecated and rejected by Python 3.11+.
    with open(inputfile, 'r') as fastq_in:
        records = SeqIO.parse(fastq_in, 'fastq')
        batches = amptklib.batch_iterator(records, chunks)
        for index, batch in enumerate(batches):
            chunk_path = os.path.join(tempdir, "chunk_%i.fq" % (index + 1))
            with open(chunk_path, 'w') as handle:
                SeqIO.write(batch, handle, "fastq")
    # Report every .fq file now present in the temporary folder.
    return [
        os.path.join(tempdir, name)
        for name in os.listdir(tempdir)
        if name.endswith(".fq")
    ]
Ejemplo n.º 2
0
    cpus = multiprocessing.cpu_count()
else:
    cpus = args.cpus

# Convert the remaining command-line values and cache the primer lengths.
MAX_PRIMER_MISMATCHES = int(args.primer_mismatch)
LabelPrefix = args.prefix
MinLen = int(args.min_len)
TrimLen = int(args.trim_len)
PL = len(FwdPrimer)
RL = len(RevPrimer)
OutCount = 0

# Split the input FASTQ file into chunks to process.
# Plain 'r' mode: the 'U' flag is deprecated and rejected by Python 3.11+.
with open(SeqIn, 'r') as fastq_in:
    SeqCount = amptklib.countfastq(SeqIn)
    amptklib.log.info('{0:,}'.format(SeqCount) + ' records loaded')
    # Parse from the handle opened above instead of re-opening the path, so
    # the file is opened once and closed by the context manager.
    SeqRecords = SeqIO.parse(fastq_in, 'fastq')
    # Floor division keeps the value an integer on Python 3 as well; the +1
    # ensures it is never zero when there are fewer records than cpus.
    chunks = SeqCount // cpus + 1
    amptklib.log.info(
        "splitting job over %i cpus, this may still take awhile" % cpus)
    # Divide into chunks, stored in a per-process temporary folder.
    pid = os.getpid()
    folder = 'amptk_tmp_' + str(pid)
    if not os.path.exists(folder):
        os.makedirs(folder)
    for i, batch in enumerate(amptklib.batch_iterator(SeqRecords, chunks)):
        filename = "chunk_%i.fq" % (i + 1)
        tmpout = os.path.join(folder, filename)
        # The context manager closes each chunk file as soon as it is
        # written, so its contents are flushed to disk.
        with open(tmpout, "w") as handle:
            count = SeqIO.write(batch, handle, "fastq")
Ejemplo n.º 3
0
            for title, seq, qual in FastqGeneralIterator(open(GoodFor)):
                ID = title.split(' ')[0]
                if not ID in singleFor:
                    peF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                else:
                    seF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    # Split the reads in GoodRev: records whose ID (the title up to the first
    # space) is listed in singleRev go to SErev, all others go to PErev.
    with open(PErev, 'w') as peR, open(SErev, 'w') as seR:
        # Open the input under a context manager so the handle is closed
        # once iteration finishes instead of being left to the interpreter.
        with open(GoodRev) as revIn:
            for title, seq, qual in FastqGeneralIterator(revIn):
                ID = title.split(' ')[0]
                record = "@%s\n%s\n+\n%s\n" % (title, seq, qual)
                if ID not in singleRev:
                    peR.write(record)
                else:
                    seR.write(record)
    # Count the outputs, print a summary, and remove the intermediate files.
    total = amptklib.countfastq(args.input)
    passed = amptklib.countfastq(PEfor)
    passedrev = amptklib.countfastq(PErev)
    # The paired forward and reverse files should hold the same number of
    # reads; a mismatch is reported but does not stop the run.
    if passed != passedrev:
        print("Error: forward reads %i != reverse reads %i" %
              (passed, passedrev))
    nopaired = len(singleFor) + len(singleRev)
    failed = len(bothfail)
    print("-------------------------------------------------------")
    print("%i total reads" % total)
    print("%i primer found properly paired: %s, %s" % (passed, PEfor, PErev))
    print("%i primer found singletons: %s, %s" % (nopaired, SEfor, SErev))
    print("%i primer not found in either forward or reverse reads" % (failed))
    amptklib.removefile(GoodFor)
    amptklib.removefile(GoodRev)
else:
Ejemplo n.º 4
0
# Main script: report on the input FASTQ and, when requested, hand chunk
# files to `worker` across a process pool.
cpus = multiprocessing.cpu_count()
# print() with a single string argument behaves the same on Python 2 and 3,
# unlike the print statement, which is a SyntaxError on Python 3.
print("----------------------------------")
tmpinput = 'amptk_show.tmp'
if args.input.endswith('.gz'):
    # Gzipped input is expanded into the scratch file before analysis.
    amptklib.Funzip(args.input, tmpinput, cpus)
else:
    tmpinput = args.input
countBarcodes(tmpinput)
print("----------------------------------")
getSeqLength(tmpinput)
print("----------------------------------")
if args.quality_trim:
    # Split the input FASTQ file into chunks inside a per-process folder.
    SeqCount = amptklib.countfastq(tmpinput)
    pid = os.getpid()
    folder = 'amptk_tmp_' + str(pid)
    amptklib.split_fastq(tmpinput, SeqCount, folder, cpus * 2)
    # Collect the chunk files that now exist in the temporary folder.
    file_list = [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.endswith(".fq")
    ]

    p = multiprocessing.Pool(cpus)
    for f in file_list:
        # NOTE(review): the AsyncResult is discarded, so an exception raised
        # inside worker() is never surfaced here -- confirm that is intended.
        p.apply_async(worker, [f])
    # No further tasks will be submitted to the pool.
    p.close()
Ejemplo n.º 5
0
                else:
                    name = ID + ".fastq"
                continue
            Barcodes[name] = line.strip()

    #check for compressed input file
    if args.FASTQ.endswith('.gz'):
        amptklib.log.info("Gzipped input files detected, uncompressing")
        # NOTE(review): str.replace removes every '.gz' occurrence in the
        # path, not only the trailing suffix -- confirm paths never contain
        # '.gz' elsewhere.
        FASTQ_IN = args.FASTQ.replace('.gz', '')
        amptklib.Funzip(args.FASTQ, FASTQ_IN, multiprocessing.cpu_count())
    else:
        FASTQ_IN = args.FASTQ

    #count FASTQ records in input
    amptklib.log.info("Loading FASTQ Records")
    total = amptklib.countfastq(FASTQ_IN)
    # NOTE(review): the size is taken from args.FASTQ (the original, possibly
    # gzipped, path) while the record count uses FASTQ_IN -- confirm which
    # file size the log line is meant to report.
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

    #output message depending on primer requirement
    # Only the values 'off', 'forward' and 'both' produce a log line in the
    # branches visible here.
    if args.require_primer == 'off':
        amptklib.log.info("Looking for %i barcodes" % (len(Barcodes)))
    elif args.require_primer == 'forward':
        amptklib.log.info(
            "Looking for %i barcodes that must have FwdPrimer: %s" %
            (len(Barcodes), FwdPrimer))
    elif args.require_primer == 'both':
        amptklib.log.info(
            "Looking for %i barcodes that must have FwdPrimer: %s and  RevPrimer: %s"
            % (len(Barcodes), FwdPrimer, RevPrimer))
Ejemplo n.º 6
0
# Resolve the UTAX database: use the entry registered in DataBase for
# args.db when there is one; otherwise, unless args.closed_ref_only is set,
# a path must be supplied through --utax_db.
if args.db in DataBase:
    utaxDB = DataBase.get(args.db)[1]
elif not args.closed_ref_only:
    if not args.utax_db:
        amptklib.log.error(
            "%s not pre-installed DB, must then also specify valid UTAX database via --utax_db"
            % args.db)
        sys.exit(1)
    utaxDB = os.path.abspath(args.utax_db)

# Count the FASTQ records and log the total together with the file size.
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(args.FASTQ)
size = checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
read_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(read_summary)

# Expected-errors filtering step; vsearch writes both FASTQ and FASTA output.
ee_tag = args.out + '.EE' + args.maxee
filter_out = os.path.join(tmp, ee_tag + '.filter.fq')
filter_fasta = os.path.join(tmp, ee_tag + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = ['vsearch', '--fastq_filter', args.FASTQ]
cmd += ['--fastq_maxee', str(args.maxee)]
cmd += ['--fastqout', filter_out]
cmd += ['--fastaout', filter_fasta]
cmd += ['--fastq_qmax', '55']
amptklib.runSubprocess(cmd, amptklib.log)
Ejemplo n.º 7
0
# Output paths handed to amptklib.DemuxIllumina for the renamed R1/R2 reads.
cleanR1 = os.path.join(tmpdir, 'renamedR1.fastq')
cleanR2 = os.path.join(tmpdir, 'renamedR2.fastq')
amptklib.DemuxIllumina(
    args.fastq, args.reverse, args.index[0], mapdict,
    args.barcode_mismatch, cleanR1, cleanR2)

# Read length: an explicitly passed value wins, otherwise it is measured
# from the renamed forward reads.
# NOTE(review): ReadLen is left unbound when check_valid_file() is false, yet
# MergeReads below reads it -- confirm how that case is meant to be handled.
if amptklib.check_valid_file(cleanR1):
    ReadLen = args.read_length if args.read_length else amptklib.GuessRL(cleanR1)

# Count the FASTQ records and log the total together with the file size.
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(cleanR1)
size = amptklib.checkfastqsize(cleanR1)
readablesize = amptklib.convertSize(size)
read_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(read_summary)

# Merge the paired reads; MergeReads receives absolute input paths.
mergedReads = args.out + '.merged.fastq'
amptklib.log.info("Merging PE reads using VSEARCH and filtering for phiX")
forward_abs = os.path.abspath(cleanR1)
reverse_abs = os.path.abspath(cleanR2)
amptklib.MergeReads(
    forward_abs, reverse_abs, tmpdir, mergedReads, ReadLen,
    args.min_len, args.usearch, args.rescue_forward, 'vsearch', '', 1)

if not args.full_length:
    if args.pad == 'off':
        amptklib.log.info("Stripping primers and trim to %s bp" %
                          (args.trim_len))