# NOTE(review): this chunk was flattened onto one physical line; formatting has
# been reconstructed.  It begins mid-loop: `line`, `iSeqMapped`, `ClusterComp`,
# `uclust_out`, `tmp`, `args`, and `parentdir` are defined by enclosing code
# outside this chunk -- confirm the indentation of the leading statements
# against the full script.
cols = line.split('\t')
OTU = cols[9]    # column 10 of the .uc record: the OTU the iSeq mapped to
Hit = cols[8]    # column 9 of the .uc record: the iSeq (query) identifier
if OTU not in iSeqMapped:
    iSeqMapped[OTU] = [Hit]
else:
    iSeqMapped[OTU].append(Hit)

# Write the OTU -> member-iSeqs composition table.
with open(ClusterComp, 'w') as clusters:
    clusters.write('OTU\tiSeqs\n')
    for k, v in natsorted(iSeqMapped.items()):
        clusters.write('%s\t%s\n' % (k, ', '.join(v)))

# strip N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.EE' + args.maxee + '.clean.fa')
ufitslib.fasta_strip_padding(uclust_out, otu_clean)

# run optional uchime_ref
if not args.uchime_ref:
    uchime_out = otu_clean
else:
    uchime_out = os.path.join(
        tmp, args.out + '.EE' + args.maxee + '.uchime.otus.fa')
    # R. Edgar now says using largest DB is better for UCHIME, so use the one
    # distributed with taxonomy
    if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:
        # test if it is one that is setup, otherwise default to full path
        uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
        if not os.path.isfile(uchime_db):
            # NOTE(review): the original chunk truncates mid-call here; the
            # message below is reconstructed from the duplicate chunk later in
            # this file -- verify against the full script.
            ufitslib.log.error(
                "Database not properly configured, run `ufits install` to setup DB, skipping chimera filtering")
# NOTE(review): this chunk was flattened onto one physical line; formatting has
# been reconstructed.  It begins mid-loop: `col`, `ID`, `seqDict`, `output`,
# `otu_counter`, `ref_clustered`, `num_refcluster`, `filt_tax_values`,
# `taxonomyLookup`, `filter_fasta`, and `orig_fasta` are defined by enclosing
# code outside this chunk.
tax = col[2]
if any(x in tax for x in filt_tax_values):
    # Keep only records whose UTAX string matches the requested level; relabel
    # sequentially and drop name/description so the FASTA header is clean.
    record = seqDict[ID]
    record.id = 'OTU' + str(otu_counter) + ';UTAX;tax=' + tax
    record.name = ''
    record.description = ''
    SeqIO.write(record, output, 'fasta')
    otu_counter += 1

total = ufitslib.countfasta(ref_clustered) - num_refcluster
ufitslib.log.info('{0:,}'.format(total) + ' classified to %s' % taxonomyLookup.get(args.utax_level))

# clean up padded N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.clean.otus.fa')
ufitslib.fasta_strip_padding(ref_clustered, otu_clean)
total = ufitslib.countfasta(otu_clean)
ufitslib.log.info('{0:,}'.format(total) + ' total OTUs')

# now map reads back to OTUs
uc_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.mapping.uc')
otu_table = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otu_table.txt')
# setup reads to map
if args.map_filtered:
    reads = filter_fasta
else:
    reads = orig_fasta
ufitslib.log.info("Mapping Reads to OTUs and Building OTU table")
# NOTE(review): the original chunk truncates inside this list; the closing
# bracket is reconstructed from the duplicate chunk later in this file (which
# continues with `ufitslib.runSubprocess(cmd, ufitslib.log)`) -- verify
# against the full script.
cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus',
       '--id', '0.97', '--db', otu_clean, '--uc', uc_out,
       '--otutabout', otu_table]
# NOTE(review): this chunk was flattened onto one physical line; formatting has
# been reconstructed.  It begins mid-loop: `col`, `seqDict`, `output`,
# `otu_counter`, `ref_clustered`, `num_refcluster`, `filt_tax_values`,
# `taxonomyLookup`, `filter_fasta`, and `orig_fasta` are defined by enclosing
# code outside this chunk.
ID = col[0]
tax = col[2]
if any(x in tax for x in filt_tax_values):
    # Keep only records whose UTAX string matches the requested level; relabel
    # sequentially and drop name/description so the FASTA header is clean.
    record = seqDict[ID]
    record.id = 'OTU' + str(otu_counter) + ';UTAX;tax=' + tax
    record.name = ''
    record.description = ''
    SeqIO.write(record, output, 'fasta')
    otu_counter += 1

total = ufitslib.countfasta(ref_clustered) - num_refcluster
ufitslib.log.info('{0:,}'.format(total) + ' classified to %s' % taxonomyLookup.get(args.utax_level))

# clean up padded N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.clean.otus.fa')
ufitslib.fasta_strip_padding(ref_clustered, otu_clean)
total = ufitslib.countfasta(otu_clean)
ufitslib.log.info('{0:,}'.format(total) + ' total OTUs')

# now map reads back to OTUs
uc_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.mapping.uc')
otu_table = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otu_table.txt')
# setup reads to map
if args.map_filtered:
    reads = filter_fasta
else:
    reads = orig_fasta
ufitslib.log.info("Mapping Reads to OTUs and Building OTU table")
cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus',
       '--id', '0.97', '--db', otu_clean, '--uc', uc_out,
       '--otutabout', otu_table]
ufitslib.runSubprocess(cmd, ufitslib.log)
# NOTE(review): this chunk was flattened onto one physical line; formatting has
# been reconstructed.  It begins mid-loop: `line`, `iSeqMapped`, `ClusterComp`,
# `uclust_out`, `tmp`, `args`, and `parentdir` are defined by enclosing code
# outside this chunk -- confirm the indentation of the leading statements
# against the full script.
cols = line.split('\t')
OTU = cols[9]    # column 10 of the .uc record: the OTU the iSeq mapped to
Hit = cols[8]    # column 9 of the .uc record: the iSeq (query) identifier
if OTU not in iSeqMapped:
    iSeqMapped[OTU] = [Hit]
else:
    iSeqMapped[OTU].append(Hit)

# Write the OTU -> member-iSeqs composition table.
with open(ClusterComp, 'w') as clusters:
    clusters.write('OTU\tiSeqs\n')
    for k, v in natsorted(iSeqMapped.items()):
        clusters.write('%s\t%s\n' % (k, ', '.join(v)))

# strip N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.EE' + args.maxee + '.clean.fa')
ufitslib.fasta_strip_padding(uclust_out, otu_clean)

# run optional uchime_ref
if not args.uchime_ref:
    uchime_out = otu_clean
else:
    uchime_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.uchime.otus.fa')
    # R. Edgar now says using largest DB is better for UCHIME, so use the one
    # distributed with taxonomy
    if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:
        # test if it is one that is setup, otherwise default to full path
        uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
        if not os.path.isfile(uchime_db):
            # DB missing: warn and fall back to the unfiltered (clean) OTUs.
            ufitslib.log.error("Database not properly configured, run `ufits install` to setup DB, skipping chimera filtering")
            uchime_out = otu_clean
    else:
        # not a bundled DB name: treat the argument as a path to a FASTA DB
        uchime_db = os.path.abspath(args.uchime_ref)
# NOTE(review): this chunk was flattened onto one physical line; formatting has
# been reconstructed.  `unoise_out`, `sort_out`, `usearch`, `tmp`, `args`, and
# `parentdir` are defined by enclosing code outside this chunk.
cmd = ["vsearch", "--sortbysize", unoise_out, "--minsize", args.minsize,
       "--output", sort_out]
ufitslib.runSubprocess(cmd, ufitslib.log)

# now run clustering algorithm
radius = str(100 - int(args.pct_otu))  # UPARSE wants a radius, not an identity
otu_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".otus.fa")
ufitslib.log.info("Clustering OTUs (UPARSE)")
cmd = [usearch, "-cluster_otus", sort_out, "-relabel", "OTU",
       "-otu_radius_pct", radius, "-otus", otu_out]
ufitslib.runSubprocess(cmd, ufitslib.log)
numOTUs = ufitslib.countfasta(otu_out)
ufitslib.log.info("{0:,}".format(numOTUs) + " OTUs")

# clean up padded N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + ".EE" + args.maxee + ".clean.otus.fa")
ufitslib.fasta_strip_padding(otu_out, otu_clean)

# optional UCHIME Ref
if not args.uchime_ref:
    uchime_out = otu_clean
else:
    uchime_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".uchime.otus.fa")
    # check if file is present, remove from previous run if it is.
    if os.path.isfile(uchime_out):
        os.remove(uchime_out)
    # R. Edgar now says using largest DB is better for UCHIME, so use the one
    # distributed with taxonomy
    # NOTE(review): the original chunk truncates inside the list below; the
    # closing bracket and the first statement of the branch are reconstructed
    # from the duplicate chunks earlier in this file -- verify the remainder
    # against the full script.
    if args.uchime_ref in ["ITS", "16S", "LSU", "COI"]:
        # test if it is one that is setup, otherwise default to full path
        uchime_db = os.path.join(parentdir, "DB", args.uchime_ref + ".extracted.fa")