Example #1
0
    # Parse the barcode FASTA into {output file name: barcode sequence}.
    Barcodes = {}
    # Plain 'r' replaces 'rU': the 'U' flag was removed in Python 3.11 and
    # text mode already handles newline translation. The handle is renamed
    # so it no longer shadows the built-in input().
    with open(barcode_file, 'r') as barcode_handle:
        for line in barcode_handle:
            if line.startswith('>'):
                # Strip the line ending explicitly rather than slicing off the
                # last character, so a final header that lacks a trailing
                # newline is not truncated.
                header = line[1:].rstrip('\r\n')
                if args.names:
                    # NOTE(review): .get() returns None when the header is not
                    # in namesDict, and the concatenation below then raises
                    # TypeError -- confirm every barcode has a mapped name.
                    name = namesDict.get(header)
                    name = name + ".fastq"
                else:
                    name = header + ".fastq"
                continue
            sequence = line.strip()
            if not sequence:
                # A blank line (e.g. at the end of the file) would otherwise
                # overwrite the previous barcode with an empty string.
                continue
            Barcodes[name] = sequence

    # Log the number of reads in the input FASTQ together with its size.
    amptklib.log.info("Loading FASTQ Records")
    total = amptklib.countfastq(args.FASTQ)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    read_summary = '{0:,}'.format(total) + ' reads (' + readablesize + ')'
    amptklib.log.info(read_summary)

    # Announce the barcode search; the wording depends on which primers are
    # required. Any other require_primer value logs nothing.
    primer_mode = args.require_primer
    barcode_count = len(Barcodes)
    if primer_mode == 'off':
        amptklib.log.info("Looking for %i barcodes" % barcode_count)
    elif primer_mode == 'forward':
        forward_msg = "Looking for %i barcodes that must have FwdPrimer: %s" % (
            barcode_count, FwdPrimer)
        amptklib.log.info(forward_msg)
    elif primer_mode == 'both':
        both_msg = (
            "Looking for %i barcodes that must have FwdPrimer: %s and  RevPrimer: %s"
            % (barcode_count, FwdPrimer, RevPrimer))
        amptklib.log.info(both_msg)
Example #2
0
# Gate on the installed DADA2 version before doing any work.
# Rversions[0] is reported as the R version and Rversions[1] as the DADA2
# version in the log messages below.
Rversions = amptklib.checkRversion()
# NOTE(review): both minimum-version strings read '******', which looks like a
# masked placeholder rather than a version number -- confirm the intended
# values before relying on this check.
R_pass = '******'
dada2_pass = '******'
if amptklib.gvc(Rversions[1], dada2_pass):
    # DADA2 is recent enough: report what was found and carry on.
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))
else:
    # Too old: point the user at the install guide and stop.
    amptklib.log.error(
        "R v%s; DADA2 v%s detected, need atleast v%s"
        % (Rversions[0], Rversions[1], dada2_pass))
    amptklib.log.error(
        "See: http://benjjneb.github.io/dada2/dada-installation.html")
    sys.exit(1)

# Report the size of the input FASTQ, then write a copy with padding
# stripped (3' N's) because dada2 can't handle them.
no_ns = args.out + '.cleaned_input.fq'
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(args.fastq)
size = amptklib.checkfastqsize(args.fastq)
readablesize = amptklib.convertSize(size)
input_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(input_summary)
amptklib.fastq_strip_padding(args.fastq, no_ns)

# Quality filter the cleaned reads with vsearch: cap expected errors at
# args.maxee and allow no N's; survivors are written to `derep`.
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = ['vsearch', '--fastq_filter', no_ns]
filtercmd += ['--fastq_maxee', str(args.maxee)]
filtercmd += ['--fastqout', derep]
filtercmd += ['--fastq_qmax', '55']
filtercmd += ['--fastq_maxns', '0']
amptklib.runSubprocess(filtercmd, amptklib.log)
# Number of reads that survived filtering.
total = amptklib.countfastq(derep)
Example #3
0
# Gate on the installed DADA2 version: Rversions[1] is compared against
# dada2_pass, and the script stops with an install pointer if it fails.
if amptklib.gvc(Rversions[1], dada2_pass):
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))
else:
    amptklib.log.error(
        "R v%s; DADA2 v%s detected, need atleast v%s"
        % (Rversions[0], Rversions[1], dada2_pass))
    amptklib.log.error(
        "See: http://benjjneb.github.io/dada2/dada-installation.html")
    sys.exit(1)

# Strip padding (3' N's, which dada2 can't handle) into `no_ns`, convert that
# file to FASTA with vsearch, and report the read count and file size.
amptklib.log.info("Loading FASTQ Records")
no_ns = args.out + '.cleaned_input.fq'
amptklib.fastq_strip_padding(args.fastq, no_ns)
demuxtmp = args.out + '.original.fa'
cmd = [
    'vsearch',
    '--fastq_filter', os.path.abspath(no_ns),
    '--fastq_qmax', '55',
    '--fastaout', demuxtmp,
]
amptklib.runSubprocess(cmd, amptklib.log)
# The count comes from the FASTA copy; the size is that of the cleaned FASTQ.
orig_total = amptklib.countfasta(demuxtmp)
size = amptklib.checkfastqsize(no_ns)
readablesize = amptklib.convertSize(size)
input_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(input_summary)

# Quality filter the cleaned reads with vsearch: cap expected errors at
# args.maxee and allow no N's; survivors are written to `derep`.
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = [
    'vsearch',
    '--fastq_filter', no_ns,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', derep,
    '--fastq_qmax', '55',
    '--fastq_maxns', '0',
]
amptklib.runSubprocess(filtercmd, amptklib.log)
# Report how many reads survived filtering.
total = amptklib.countfastq(derep)
passed_summary = '{0:,}'.format(total) + ' reads passed'
amptklib.log.info(passed_summary)

#split into individual files
amptklib.log.info("Splitting FASTQ file by Sample into individual files")
filtfolder = args.out+'_filtered'
if os.path.isdir(filtfolder):
Example #4
0
    # Reverse-complement every record of the reverse-barcode FASTA, store it
    # in RevBarcodes keyed by record id, and write it to rev_barcode_file as
    # FASTA. A repeated record id is treated as fatal.
    with open(rev_barcode_file, 'w') as output:
        # Plain 'r' replaces 'rU': the 'U' flag was removed in Python 3.11.
        # The handle is renamed so it no longer shadows the built-in input().
        with open(args.reverse_barcode, 'r') as barcode_handle:
            for rec in SeqIO.parse(barcode_handle, 'fasta'):
                if rec.id in RevBarcodes:
                    amptklib.log.error(
                        "Duplicate reverse barcodes detected, exiting")
                    sys.exit(1)
                RevSeq = str(rec.seq.reverse_complement())
                RevBarcodes[rec.id] = RevSeq
                output.write('>%s\n%s\n' % (rec.id, RevSeq))
# Report the number of reads in the input and the size of the file.
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(SeqIn)
size = amptklib.checkfastqsize(SeqIn)
readablesize = amptklib.convertSize(size)
input_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(input_summary)

# Working directory: the output name up to its first '.', suffixed with this
# process's id.
out_prefix = args.out.split('.')[0]
tmpdir = out_prefix + '_' + str(os.getpid())
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)

# Split the FASTQ into the working directory; the last argument is twice
# the cpu count.
chunk_count = cpus * 2
amptklib.split_fastq(SeqIn, orig_total, tmpdir, chunk_count)

#now get file list from tmp folder
file_list = []
for file in os.listdir(tmpdir):
    if file.endswith(".fq"):
# Demultiplex the forward/reverse reads using the first index file, the
# sample map and the allowed barcode mismatches; output goes to
# cleanR1 / cleanR2.
cleanR2 = os.path.join(tmpdir, 'renamedR2.fastq')
amptklib.DemuxIllumina(
    args.fastq, args.reverse, args.index[0], mapdict,
    args.barcode_mismatch, cleanR1, cleanR2)

# Read length: an explicitly passed value wins, otherwise it is estimated
# from the demultiplexed forward reads.
# NOTE(review): ReadLen is left unset when check_valid_file() fails, yet it is
# passed to MergeReads further down -- confirm that case cannot happen.
if amptklib.check_valid_file(cleanR1):
    ReadLen = args.read_length or amptklib.GuessRL(cleanR1)

# Report the number of demultiplexed forward reads and the file size.
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(cleanR1)
size = amptklib.checkfastqsize(cleanR1)
readablesize = amptklib.convertSize(size)
input_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(input_summary)

# Merge the paired-end reads into <out>.merged.fastq; both inputs are passed
# as absolute paths.
mergedReads = args.out + '.merged.fastq'
amptklib.log.info("Merging PE reads using VSEARCH and filtering for phiX")
forward_path = os.path.abspath(cleanR1)
reverse_path = os.path.abspath(cleanR2)
amptklib.MergeReads(
    forward_path, reverse_path, tmpdir, mergedReads, ReadLen,
    args.min_len, args.usearch, args.rescue_forward, 'vsearch', '', 1)

if not args.full_length:
    if args.pad == 'off':
        amptklib.log.info("Stripping primers and trim to %s bp" %
                          (args.trim_len))
    else: