Beispiel #1
0
def getAvgLength(input):
    """Summarize the read lengths found in a FASTQ file.

    Args:
        input: Path of the FASTQ file to scan. (The name shadows the
            builtin but is kept so existing callers keep working.)

    Returns:
        A tuple ``(average, minimum, maximum, fifth_percentile)``:
        ``average`` is a float, ``minimum`` and ``maximum`` are ints, and
        ``fifth_percentile`` is ``np.percentile(lengths, 5)`` truncated to
        an int.

    Raises:
        ZeroDivisionError: If the file yields no reads.
    """
    # Open via a context manager so the handle is closed as soon as the
    # lengths are collected; the original left it open until garbage
    # collection.
    with open(input) as handle:
        lengths = [len(seq) for _, seq, _ in FastqGeneralIterator(handle)]
    average = sum(lengths) / float(len(lengths))
    # Percentile 5, not 95: the length that roughly 95% of the reads meet
    # or exceed.
    fifth_percentile = np.percentile(np.array(lengths), 5)
    return (average, min(lengths), max(lengths), int(fifth_percentile))


#remove logfile if exists
#Start every run with a fresh log named after the output base name.
log_name = args.out + '.amptk-dada2.log'
if os.path.isfile(log_name):
    amptklib.removefile(log_name)

#logging has to be configured before the first amptklib.log call below
amptklib.setupLogging(log_name)
#write-only handle on the null device; not used within this span
FNULL = open(os.devnull, 'w')
#record the exact command line in the debug log
cmd_args = " ".join(sys.argv) + '\n'
amptklib.log.debug(cmd_args)
#parenthesized single-argument print: same output on Python 2, and valid
#syntax on Python 3 (the bare print statement is not)
print("-------------------------------------------------------")
#initialize script, log system info and usearch version
amptklib.SystemInfo()
#Do a version check
usearch = args.usearch
amptklib.versionDependencyChecks(usearch)

#check dependencies
#Rscript is the only external program checked here
programs = ['Rscript']
amptklib.CheckDependencies(programs)
Beispiel #2
0
        lines = [line.rstrip('\n') for line in input]
    remove = remove + lines

#names passed directly on the command line are added to the removal list
if args.list:
    lines = args.list
    remove = remove + lines

#make sure it is a set, faster lookup
#NOTE(review): despite its name, keep_list holds the names collected in
#`remove`, and its size is reported below as "Removed %i samples".
keep_list = set(remove)
count = len(keep_list)

#now run filtering
#counters reported at the end; nothing in this span updates them, so
#presumably filter_sample() does - confirm against its definition
keep_count = 0
total_count = 0

#rename to base
#Strip only the trailing ".gz". str.replace() would also delete any ".gz"
#occurring earlier in the path, so the recompressed file could end up under
#a different name than args.out.
if args.out.endswith('.gz'):
    outfile = args.out[:-len('.gz')]
else:
    outfile = args.out
#run filtering
filter_sample(SeqIn, outfile)
#compress and clean
if args.out.endswith('.gz'):  #compress in place
    amptklib.Fzip_inplace(outfile)
#a gzipped input means SeqIn is a temporary copy (presumably decompressed
#earlier in the script), so drop it
if args.input.endswith('.gz'):
    amptklib.removefile(SeqIn)

print("Removed %i samples" % count)
print("Kept %i reads out of %i total reads" % (keep_count, total_count))
Beispiel #3
0
    )

#create OTU phylogeny for downstream processes
#The tree file is named <base>.tree.phy.
amptklib.log.info("Generating phylogenetic tree")
tree_out = base + '.tree.phy'
#Command line: the `usearch` value set earlier, its -cluster_agg option
#applied to args.fasta, with -treeout pointing at tree_out.
cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
#runSubprocess gets the argument list and the shared logger (presumably it
#runs the command and logs its output - confirm in amptklib).
amptklib.runSubprocess(cmd, amptklib.log)

#print some summary file locations
amptklib.log.info("Taxonomy finished: %s" % taxFinal)
if args.otu_table and not args.method == 'blast':
    amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
    #output final OTU table in Biom v1.0 (i.e. json format if biom installed)
    outBiom = base + '.biom'
    if amptklib.which('biom'):
        amptklib.removefile(outBiom)
        cmd = [
            'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp',
            '--table-type', "OTU table", '--to-json'
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        if args.mapping_file:
            cmd = [
                'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom,
                '--observation-metadata-fp', qiimeTax, '-m', args.mapping_file,
                '--sc-separated', 'taxonomy', '--output-as-json'
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
        else:
            cmd = [
                'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom,
Beispiel #4
0
#Command-line options registered on `parser`, which is created before this
#span (presumably an argparse.ArgumentParser - confirm).
#-o/--out: optional base name; when omitted args.out is None.
parser.add_argument('-o','--out', help='Base output name')
#--min_reads_otu: integer threshold, defaults to 2.
parser.add_argument('--min_reads_otu', default=2, type=int, help='Minimum number of reads per OTU for experiment')
#-u/--usearch: executable name stored in args.usearch, defaults to 'usearch9'.
parser.add_argument('-u','--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
#--debug: boolean flag, False unless given.
#NOTE(review): the help text "Remove Intermediate Files" reads oddly for a
#debug switch - confirm it matches how args.debug is used.
parser.add_argument('--debug', action='store_true', help='Remove Intermediate Files')
#parse the command line into `args`
args=parser.parse_args()

#Pick the base name used for the log file: the explicit --out value, or
#otherwise everything before the first ".otu_table" in the table's path.
if not args.out:
    #get base name of files
    base = args.otu_table.split(".otu_table")[0]
else:
    base = args.out

#remove logfile if exists
log_name = base + '.amptk-filter.log'
amptklib.removefile(log_name)

#logging has to be configured before the first amptklib.log call below
amptklib.setupLogging(log_name)
#write-only handle on the null device; not used within this span
FNULL = open(os.devnull, 'w')
#record the exact command line in the debug log
cmd_args = " ".join(sys.argv)+'\n'
amptklib.log.debug(cmd_args)
#parenthesized single-argument print: same output on Python 2, and valid
#syntax on Python 3 (the bare print statement is not)
print("-------------------------------------------------------")

#initialize script, log system info and usearch version
amptklib.SystemInfo()
#Do a version check
usearch = args.usearch
amptklib.versionDependencyChecks(usearch)

#log which OTU table is about to be loaded
amptklib.log.info("Loading OTU table: %s" % args.otu_table)
Beispiel #5
0
    if k not in barcodes_found:
        barcodes_found.append(k)
#Log how many distinct barcodes were counted, followed by the preformatted
#per-sample table held in barcode_counts.
amptklib.log.info("Found %i barcoded samples\n%s" %
                  (len(BarcodeCount), barcode_counts))

if not args.mapping_file:
    #create a generic mappingfile for downstream processes
    #written to <out>.mapping_file.txt; the reverse primer is passed in
    #reverse-complemented form via revcomp_lib.RevComp
    genericmapfile = args.out + '.mapping_file.txt'
    amptklib.CreateGenericMappingFile(barcode_file, FwdPrimer,
                                      revcomp_lib.RevComp(RevPrimer), Adapter,
                                      genericmapfile, barcodes_found)

#compress the output to save space
#the uncompressed file is removed once Fzip has been called
FinalDemux = catDemux + '.gz'
amptklib.Fzip(catDemux, FinalDemux, cpus)
amptklib.removefile(catDemux)
#remove the files named like each gzip_list entry minus ".gz" (presumably
#decompressed temporary copies - confirm where gzip_list is built)
if gzip_list:
    for file in gzip_list:
        #note: replace() drops every ".gz" in the name, not only the suffix
        file = file.replace('.gz', '')
        amptklib.removefile(file)

#get file size
filesize = os.path.getsize(FinalDemux)
readablesize = amptklib.convertSize(filesize)
amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
#NOTE(review): within this span genericmapfile is only assigned when no
#mapping file was supplied; unless it is also set earlier in the script this
#line raises NameError when args.mapping_file is given - confirm.
amptklib.log.info("Mapping file: %s" % genericmapfile)

print "-------------------------------------------------------"
if 'win32' in sys.platform:
    print "\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux)
else:
Beispiel #6
0
def primer2Strip(file, GoodOut, BadOut, fwdprimer, revprimer):


#now run primer strip
#Paired mode (args.reverse given) strips both read files and then splits the
#surviving reads into properly paired files and singleton files; single mode
#only strips args.input.
if args.reverse:
    if not args.rev_primer:
        print("ERROR: if reverse reads passed you must provide -r,--rev_primer")
        sys.exit(1)
    #first run forwards
    GoodFor = args.out + '.fwd.stripped.fq'
    BadFor = args.out + '.fwd.no_primer.fq'
    primerStrip(args.input, GoodFor, BadFor, args.fwd_primer, args.rev_primer)
    #now run reverse (primer arguments swapped relative to the forward call)
    GoodRev = args.out + '.rev.stripped.fq'
    BadRev = args.out + '.rev.no_primer.fq'
    primerStrip(args.reverse, GoodRev, BadRev, args.rev_primer, args.fwd_primer)
    #now get bad reads into list
    #IDs read from the forward "bad" file go into singleRev (the reverse mate
    #has lost its partner) and IDs from the reverse "bad" file go into
    #singleFor. Every file is opened with "with" so no handle is left open.
    singleRev = []
    singleFor = []
    with open(BadFor) as bad_fwd:
        for title, seq, qual in FastqGeneralIterator(bad_fwd):
            singleRev.append(title.split(' ')[0])
    with open(BadRev) as bad_rev:
        for title, seq, qual in FastqGeneralIterator(bad_rev):
            singleFor.append(title.split(' ')[0])
    #set copies give O(1) membership tests in the per-read loops below; the
    #lists themselves are kept because their lengths are reported later
    singleFor_ids = set(singleFor)
    singleRev_ids = set(singleRev)
    #IDs present in both lists, ordered by first appearance in singleFor
    #(same order as sorting with key=singleFor.index, without the repeated
    #linear scans)
    first_seen = {}
    for idx, read_id in enumerate(singleFor):
        first_seen.setdefault(read_id, idx)
    bothfail = sorted(singleRev_ids & singleFor_ids, key=first_seen.__getitem__)

    #now get PE and singletons
    PEfor = args.out +'.pe_R1.fastq'
    PErev = args.out +'.pe_R2.fastq'
    SEfor = args.out +'.fwd.singletons.fastq'
    SErev = args.out +'.rev.singletons.fastq'
    #forward reads: paired output unless the ID is listed in singleFor
    with open(PEfor, 'w') as peF, open(SEfor, 'w') as seF:
        with open(GoodFor) as good_fwd:
            for title, seq, qual in FastqGeneralIterator(good_fwd):
                ID = title.split(' ')[0]
                if ID not in singleFor_ids:
                    peF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                else:
                    seF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    #reverse reads: paired output unless the ID is listed in singleRev
    with open(PErev, 'w') as peR, open(SErev, 'w') as seR:
        with open(GoodRev) as good_rev:
            for title, seq, qual in FastqGeneralIterator(good_rev):
                ID = title.split(' ')[0]
                if ID not in singleRev_ids:
                    peR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                else:
                    seR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    #do some counts of output and cleanup
    total = amptklib.countfastq(args.input)
    passed = amptklib.countfastq(PEfor)
    passedrev = amptklib.countfastq(PErev)
    if passed != passedrev:
        print("Error: forward reads %i != reverse reads %i" % (passed, passedrev))
    #NOTE(review): this sum counts every ID in both lists, including those in
    #bothfail; if primerStrip sends each read to exactly one of its two
    #outputs, those reads never reach the singleton files - confirm whether
    #the reported singleton count should exclude them.
    nopaired = len(singleFor) + len(singleRev)
    failed = len(bothfail)
    print("%i total reads" % total)
    print("%i primer found properly paired: %s, %s" % (passed, PEfor, PErev))
    print("%i primer found singletons: %s, %s" % (nopaired, SEfor, SErev))
    print("%i primer not found in either forward or reverse reads" % (failed))
    #the intermediate stripped files are no longer needed; their handles were
    #closed above before removal
    amptklib.removefile(GoodFor)
    amptklib.removefile(GoodRev)
else:
    #single-file mode: no reverse primer is passed (False)
    GoodOut = args.out +'.stripped.fq'
    BadOut = args.out +'.no_primer_found.fq'
    primerStrip(args.input, GoodOut, BadOut, args.fwd_primer, False)
    total = amptklib.countfastq(args.input)
    passed = amptklib.countfastq(GoodOut)
    failed = amptklib.countfastq(BadOut)
    print("%i total reads" % total)
    print("%i primer found/stripped: %s" % (passed, GoodOut))
    print("%i primer not found: %s" % (failed, BadOut))
Beispiel #7
0
                    seR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    #do some counts of output and cleanup
    total = amptklib.countfastq(args.input)
    passed = amptklib.countfastq(PEfor)
    passedrev = amptklib.countfastq(PErev)
    if passed != passedrev:
        print("Error: forward reads %i != reverse reads %i" %
              (passed, passedrev))
    nopaired = len(singleFor) + len(singleRev)
    failed = len(bothfail)
    print("-------------------------------------------------------")
    print("%i total reads" % total)
    print("%i primer found properly paired: %s, %s" % (passed, PEfor, PErev))
    print("%i primer found singletons: %s, %s" % (nopaired, SEfor, SErev))
    print("%i primer not found in either forward or reverse reads" % (failed))
    amptklib.removefile(GoodFor)
    amptklib.removefile(GoodRev)
else:
    #setup tmpdir
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    GoodOut = args.out + '.stripped.fq'
    BadOut = args.out + '.no_primer_found.fq'
    filelist = splitter(args.input, tmpdir)
    amptklib.runMultiProgress(primerStrip, filelist, cpus)
    combiner(tmpdir, ".good", GoodOut)
    combiner(tmpdir, ".bad", BadOut)
    shutil.rmtree(tmpdir)
    total = amptklib.countfastq(args.input)
Beispiel #8
0
#run the command assembled just before this point
amptklib.runSubprocess(cmd, amptklib.log)

#count OTUs
otu_count = amptklib.countfasta(newOTUs)
amptklib.log.info('{0:,}'.format(otu_count) + ' OTUs remaining')

#count reads mapped
total = amptklib.line_count(uc_out)
orig_total = amptklib.countfasta(tmpReads)
#guard the percentage: with zero input reads the division would raise
#ZeroDivisionError before the output locations are printed and the
#temporary files are cleaned up
if orig_total:
    pct_mapped = total / float(orig_total) * 100
else:
    pct_mapped = 0.0
amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                  '({0:.0f}%)'.format(pct_mapped))

#Print location of files to STDOUT
#parenthesized single-argument prints: same output on Python 2, and valid
#syntax on Python 3 (the bare print statement is not)
print("-------------------------------------------------------")
print("Clustered OTUs: %s" % newOTUs)
print("OTU Table: %s" % newTable)
print("-------------------------------------------------------")

#cleanup
amptklib.removefile(tmpReads)
amptklib.removefile(uc_out)

#show only the last '/'-separated path component in the example command
otu_print = newOTUs.split('/')[-1]
tab_print = newTable.split('/')[-1]
#the win32 branch prints the hint without the colr.WARN/colr.END wrappers
if 'win32' in sys.platform:
    print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (
        tab_print, otu_print))
else:
    print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (
        tab_print, otu_print))
Beispiel #9
0
    for file in os.listdir(folder):
        if file.endswith(".fq"):
            file = os.path.join(folder, file)
            file_list.append(file)

    p = multiprocessing.Pool(cpus)
    for f in file_list:
        #worker(f)
        p.apply_async(worker, [f])
    p.close()
    p.join()

    #get filtered results
    catDemux = args.out
    with open(catDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(folder,'*.filter.fq')):
            if filename == catDemux:
                continue
            with open(filename, 'rU') as readfile:
                shutil.copyfileobj(readfile, outfile)
    if catDemux.endswith('.gz'):
        amptklib.Fzip_inplace(catDemux)
    shutil.rmtree(folder)
    print "----------------------------------"
    countBarcodes(args.out)
    print "----------------------------------"
    print "Script finished, output in %s" % args.out

#When the input path ends in ".gz", remove the file named by tmpinput
#(presumably a decompressed temporary copy made earlier - confirm where
#tmpinput is assigned).
if args.input.endswith('.gz'):
    amptklib.removefile(tmpinput)
Beispiel #10
0
    amptklib.log.info("Now Gzipping files")
    for file in os.listdir(args.out):
        if file.endswith(".fastq"):
            file_path = os.path.join(args.out, file)
            amptklib.Fzip_inplace(file_path)

    #after all files demuxed into output folder, loop through and create SRA metadata file
    filelist = []
    for file in os.listdir(args.out):
        if file.endswith(".fastq.gz"):
            filelist.append(file)

#report where the results were written
amptklib.log.info("Finished: output in %s" % args.out)
#clean up if gzipped
#FASTQ_IN is removed only when the original path ended in ".gz" (presumably
#it is a decompressed temporary copy - confirm where FASTQ_IN is assigned)
if args.FASTQ.endswith('.gz'):
    amptklib.removefile(FASTQ_IN)

#check for BioSample meta file
if args.biosample:
    amptklib.log.info(
        "NCBI BioSample file detected, creating SRA metadata file")
    #load in BioSample file to dictionary
    with open(args.biosample, 'rU') as input:
        reader = csv.reader(input, delimiter='\t')
        header = next(reader)
        acc = header.index('Accession')
        sample = header.index('Sample Name')
        bio = header.index('BioProject')
        try:
            host = header.index('Host')
        except ValueError:
        ID = line.split("=", 1)[-1].split(";")[0]
        if ID not in BarcodeCount:
            BarcodeCount[ID] = 1
        else:
            BarcodeCount[ID] += 1

#now let's count the barcodes found and count the number of times they are found.
#Build a right-aligned two-column table, ordered by count, highest first.
barcode_counts = "%30s:  %s" % ('Sample', 'Count')
#The key takes the (name, count) pair as one argument: tuple-unpacking
#lambda parameters are Python-2-only syntax, indexing works on 2 and 3.
for k, v in natsorted(BarcodeCount.items(), key=lambda item: item[1],
                      reverse=True):
    barcode_counts += "\n%30s:  %s" % (k, str(v))
amptklib.log.info("Found %i barcoded samples\n%s" %
                  (len(BarcodeCount), barcode_counts))

#compress the output to save space
#the uncompressed file is removed once Fzip has been called
FinalDemux = Demux + '.gz'
amptklib.Fzip(Demux, FinalDemux, cpus)
amptklib.removefile(Demux)

#get file size
filesize = os.path.getsize(FinalDemux)
readablesize = amptklib.convertSize(filesize)
amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
amptklib.log.info("Mapping file: %s" % args.mapping_file)
#parenthesized single-argument prints: same output on Python 2, and valid
#syntax on Python 3 (the bare print statement is not)
print("-------------------------------------------------------")
#the win32 branch prints the hint without the col.WARN/col.END wrappers
if 'win32' in sys.platform:
    print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
else:
    print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (
        FinalDemux))