def __init__(self, name, minFilter=True, fastafile=None, qualfile=None,
             otus=None):
    """Create an empty classification tree and optionally load read data.

    name      -- name passed on to Tree.__init__.
    minFilter -- stored as self.minFilter.
    fastafile -- optional path to a FASTA file; every record's sequence is
                 stored in self.seqs under the record id.
    qualfile  -- optional path to a QUAL file; every record is stored in
                 self.qual under the record id.
    otus      -- accepted for interface compatibility; not used here.
    """
    Tree.__init__(self, Node(name="root", nodeID=1), name=name)
    self.bsr = BSR_DEFAULT
    self.ms = MS_DEFAULT
    self.minFilter = minFilter

    # Collector node for reads that cannot be classified.
    self.noHits = Node("No hits", parent=self.root)
    self.addNode(self.noHits)

    self.seqs = {}
    self.qual = {}
    self.read_node_assignments = {}

    if fastafile:
        # The context manager closes the handle even if parsing fails.
        with open(fastafile, 'r') as fstream:
            for seq_record in SeqIO.parse(fstream, "fasta"):
                self.seqs[seq_record.id] = seq_record.seq
    if qualfile:
        with open(qualfile, 'r') as qstream:
            for q_record in SeqIO.parse(qstream, "qual"):
                self.qual[q_record.id] = q_record
def __init__(self, name=None):
    """Create a tree holding only the root and "Cellular organisms" nodes.

    The NCBI rank name lists (phylum, class, order, family, genus) are
    filled from the files below "$LCATaxonomyDir/ranks/". Loading them is
    best effort: if the environment variable is unset or a file cannot be
    read, the lists are left as they are at that point.

    name -- name passed on to Tree.__init__.
    """
    Tree.__init__(self, Node(name="root", nodeID=1), name)
    self.cellOrg = Node(name="Cellular organisms", parent=self.root,
                        nodeID=2)
    self.addNode(self.cellOrg)
    self.idCount = 10000000
    self.problemNodes = []
    self.rejected = []

    self.phylumNames = []
    self.classNames = []
    self.orderNames = []
    self.familyNames = []
    self.genusNames = []

    try:
        rankDir = os.environ['LCATaxonomyDir'] + "/ranks/"
        ncbiRanks = {rankDir + "all_ncbi_phylum": self.phylumNames,
                     rankDir + "all_ncbi_class": self.classNames,
                     rankDir + "all_ncbi_order": self.orderNames,
                     rankDir + "all_ncbi_family": self.familyNames,
                     rankDir + "all_ncbi_genus": self.genusNames}
        for filename, names in ncbiRanks.items():
            with open(filename, 'r') as readFile:
                for line in readFile:
                    # Strip only the line terminator; slicing off the last
                    # character would damage a final line without newline.
                    names.append(line.rstrip("\r\n"))
    except (KeyError, IOError, OSError):
        # Rank lists are optional: a missing LCATaxonomyDir (KeyError) or
        # unreadable rank file must not prevent building the tree.
        pass
def parseQIIME(self, outFile, confidence=0.5, weighFlows=True):
    """Read a QIIME/RDP assignment file and assign its reads to the tree.

    Every line is expected to hold three tab-separated fields: the read
    name, a semicolon-separated taxon path and a confidence value, e.g.

        IO95TX02.787F-MID-1_s60_c01_T400_s30_c08_2_3
        Bacteria;Proteobacteria;Deltaproteobacteria;Desulfobulbaceae
        0.82

    outFile    -- path of the assignment file.
    confidence -- minimum confidence (number or numeric string) a line
                  needs for its taxon path to be added; reads below it are
                  assigned to the root.
    weighFlows -- accepted for interface compatibility; not used here.

    Raises ValueError if a confidence value is not numeric.
    """
    # Compare numerically. The values arrive as text (file column, possibly
    # a command-line argument), and text/number comparison does not filter.
    confidence = float(confidence)

    with open(outFile, 'r') as handle:
        for line in handle:
            sl = line.split("\t")
            if len(sl) < 3:
                # Blank or truncated line: nothing to assign.
                continue
            rName = sl[0]
            taxa = sl[1].split(";")
            conf = float(sl[2])

            read = RDPRead(name=rName, reverse=None)
            node = parent = self.root
            for level, rawName in enumerate(taxa):
                nodeName = rawName.replace("\"", "")

                # A taxon named like one of its ancestors gets that
                # ancestor's name appended to keep node names distinct.
                p = parent
                while p is not self.root:
                    if nodeName == p.name:
                        nodeName += " (%s)" % p.name
                        break
                    p = p.parent

                if conf >= confidence:
                    node = self.getNode(nodeName)
                    if not node:
                        node = Node(parent=parent, name=nodeName,
                                    depth=level)
                        self.addNode(node)
                    parent = node

            node.assignRead(read, primary=True, recursive=True)
def __init__(self, name, minFilter=True, fastafile=None, qualfile=None,
             otus=None):
    """Create an empty classification tree and optionally load read data.

    name      -- name passed on to Tree.__init__.
    minFilter -- stored as self.minFilter.
    fastafile -- optional path to a FASTA file; every record's sequence is
                 stored in self.seqs under the record id.
    qualfile  -- optional path to a QUAL file; every record is stored in
                 self.qual under the record id.
    otus      -- accepted for interface compatibility; not used here.
    """
    Tree.__init__(self, Node(name="root", nodeID=1), name=name)
    self.bsr = BSR_DEFAULT
    self.ms = MS_DEFAULT
    self.minFilter = minFilter

    # Collector node for reads that cannot be classified.
    self.noHits = Node("No hits", parent=self.root)
    self.addNode(self.noHits)

    self.seqs = {}
    self.qual = {}
    self.read_node_assignments = {}

    if fastafile:
        # The context manager closes the handle even if parsing fails.
        with open(fastafile, 'r') as fstream:
            for seq_record in SeqIO.parse(fstream, "fasta"):
                self.seqs[seq_record.id] = seq_record.seq
    if qualfile:
        with open(qualfile, 'r') as qstream:
            for q_record in SeqIO.parse(qstream, "qual"):
                self.qual[q_record.id] = q_record
def processChangesMetadata(self, line):
    """Apply one line of a manual taxonomy change list to the tree.

    The line holds four semicolon-separated fields:
    oldName;newName;newParent;rank

    * empty oldName                -> add newName below newParent
    * empty newName and newParent  -> delete oldName
    * newName without ".."         -> rename and/or move oldName
    * newName containing ".."      -> only verify that both endpoints of
                                      the "first..last" range in oldName
                                      already sit below newParent

    Problems are reported on stderr (or stdout for informational
    messages); nothing is raised for a malformed line. Returns None.
    """
    # NOTE(review): the nesting of this method was reconstructed from a
    # whitespace-collapsed source; in particular confirm that setting
    # n.depth belongs to the "move" branch only.
    try:
        parts = line.split(";")
        oldName = parts[0]
        newName = parts[1]
        newParent = parts[2]
        rank = parts[3]
    except (IndexError, AttributeError):
        sys.stderr.write("Warning: Incorrect manual change line: %s\n" % line)
        return

    parent = self.getNode(newParent)
    if newParent and not parent:
        sys.stderr.write("Cannot find parent node %s\n" % newParent)
        return

    # Depth of the affected node: an explicit rank wins, otherwise one
    # level below the new parent, otherwise 0.
    depth = 0
    if rank:
        for d in ARBor.depths.keys():
            if ARBor.depths[d] == rank:
                depth = d
                break
    elif parent:
        depth = parent.depth + 1

    # Add new node
    if not oldName:
        if self.getNode(newName):
            print("Already present: %s" % newName)
        elif not parent:
            print("Warning: Cannot add taxon %s as parent %s "
                  "does not exist" % (newName, newParent))
        else:
            newNode = Node(name=newName, parent=parent,
                           nodeID=self.newID(), depth=depth)
            self.addNode(newNode)
            print("Added new taxon: %s" % newName)

    # Remove node
    elif not newName and not newParent:
        existing = self.getNode(oldName)
        if not existing:
            print("Already deleted: %s" % oldName)
        else:
            self.deleteNode(existing, False)
            print("Deleted taxon: %s" % oldName)

    # Move or rename taxon
    elif ".." not in newName:
        n = self.getNode(oldName)
        if not n:
            if self.getNode(newName):
                print("Already moved / renamed: %s" % oldName)
            else:
                sys.stderr.write("Cannot find node: %s\n" % oldName)
        else:
            if newName:
                self.renameNode(n, newName)
                print("Renamed %s to %s" % (oldName, newName))
            if newParent and not (newParent == n.parent.name):
                print("Moving %s from %s to %s" %
                      (oldName, n.parent.name, newParent))
                self.moveNode(n, self.getNode(newParent))
                n.depth = depth

    # Control shorthand annotation with ..
    else:
        # Compare parent *names* with the requested parent's name; the
        # original compared a name with the Node object itself, which can
        # never be equal, so the warning was always written.
        expected = parent.name if parent else None
        endpoints = oldName.split("..")[:2]
        placed = len(endpoints) == 2
        for endName in endpoints:
            endNode = self.getNode(endName)
            if (endNode is None or endNode.parent is None or
                    endNode.parent.name != expected):
                placed = False
                break
        if not placed:
            sys.stderr.write("Warning: Taxons %s not moved properly in "
                             "NDS file!!\n" % oldName)
def _readNDSLine(self, line, eukaryotic=True, altTax=True, GGMode=True):
    """Add the taxon path described by one tab-separated NDS line.

    Fields read from the line: accession (any ".version" suffix is cut
    off), taxonomy path, NCBI name and, optionally, a fourth field that is
    searched for "Chloroplast" / "mitochondria".

    line       -- one line of the NDS export.
    eukaryotic -- when true, organelle handling is enabled; when false
                  (and GGMode is false) eukaryotic / organelle entries are
                  rejected and their accession stored in self.rejected.
    altTax     -- with eukaryotic, rebuild the path below "Eukaryota" with
                  a "Plastid" / "Mitochondrion" / "Nucleus" level.
    GGMode     -- split the path on "/" and ";" only and strip a
                  three-character prefix from each rank taxon.

    Returns None. Nodes are created through self.addNode and the accession
    is attached to the last node of the path as a Read.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm it against the original file
    # before relying on it.
    parts = line.split("\t")
    accession = parts[0]
    if "." in accession:
        accession = accession[:accession.find(".")]
    taxonomy = parts[1]
    ncbi_name = parts[2]
    plast = False
    mito = False
    if (eukaryotic and len(parts) > 3):
        # or (GGMode and "Chloroplast" in taxonomy)
        if "Chloroplast" in parts[3]:
            plast = True
        elif "mitochondria" in parts[3].lower():
            mito = True
    parent = self.cellOrg
    depth = Tree.META
    # NOTE(review): this guard can never trigger here; parts[1] and
    # parts[2] were already read above, so a short line raises IndexError
    # before reaching it.
    if len(parts) < 1:
        print "Problem:\n%s" % line
        return
    if GGMode:
        taxonomy = taxonomy.replace("; ", ";")
        taxa = re.split('[/;]', taxonomy)
    else:
        taxa = re.split('[_/;]', taxonomy)
    ncbi_name = ncbi_name.replace("\n", "")

    # Do not use eukaryotic reads in non-eukaryotic mode
    if not eukaryotic and not GGMode:
        if taxa[0] == "Eukarya" or ("Chloroplast" in taxa) \
                or ("mitochondria" in taxa):
            self.rejected.append(accession)
            return

    # Ignore uncultured groups and handle like clustered to parent.
    if taxa[-1] == "uncultured":
        taxa = taxa[:-1]

    # Alternative taxonomy fix: rebuild the path as
    # Eukaryota / (Plastid | Mitochondrion | Nucleus) / remaining taxa.
    if eukaryotic and altTax:
        if accession in self.rejected:
            del self.rejected[self.rejected.index(accession)]
        alt = ["Eukaryota"]
        if plast or (" plastid" in ncbi_name.lower()) or \
                ("chloroplast" in ncbi_name.lower()):
            alt.append("Plastid")
            plast = True
        elif mito or ("mitochondrion" in ncbi_name.lower()):
            alt.append("Mitochondrion")
            mito = True
        else:
            alt.append("Nucleus")
        # Put extra labels on all children of these new groups
        extra = ""
        if mito:
            extra = " (Mitochondrion)"
        elif plast:
            extra = " (Plastid)"
        for taxon in taxa[1:]:
            if len(taxon) > 0:
                alt.append(taxon + extra)
        taxa = alt

    # Use NCBI Taxonomy species name if meaningful, i.e. if it contains
    # none of the ARBor.nonSpeciesKeys substrings.
    species = True
    for key in ARBor.nonSpeciesKeys:
        if key in ncbi_name:
            species = False
            break
    if species:
        if eukaryotic:
            if (plast or mito or (" plastid" in ncbi_name.lower()) or
                    (" mitochondrion" in ncbi_name.lower())):
                name_only = ncbi_name.replace(" Plastid", "")
                name_only = name_only.replace(" plastid", "")
                name_only = name_only.replace(" Mitochondrion", "")
                name_only = name_only.replace(" mitochondrion", "")
            else:
                name_only = ncbi_name
                ncbi_name = name_only + " nucleus"
            if not altTax:
                taxa.append(name_only)
        taxa.append(ncbi_name)
        # --- Remove line to use synonyms file instead
        taxa.append(accession)
        # Makes synonyms file redundant.
        # ---
    else:
        # Not species
        taxa.append(accession)
        # Remove line to use synonyms file instead

    # NOTE(review): i is incremented in the loop but never read.
    i = 0
    # Names of the nodes already placed on this path.
    parents = []
    for taxon in taxa:
        # If we are in GGMode then there will be a species name and
        # accession that should not be touched (identity comparison with
        # the trailing list entries picks out exactly those two).
        if (GGMode and len(taxon) > 3 and not
                ((taxon is taxa[-1]) or (taxon is taxa[-2] and species))):
            GGfix = True
            taxon = taxon[3:]
            # NOTE(review): dsym is taken AFTER the prefix was stripped, so
            # it is the first letter of the taxon name, not of the rank
            # prefix; the dsym == "k"/"p"/... tests below presumably meant
            # the prefix letter. Confirm.
            dsym = taxon[0]
        else:
            GGfix = False

        # Check if taxon is numerical or empty; such taxa are skipped
        # outside GGMode.
        if len(re.findall('[0-9]', taxon)) == len(taxon) and not GGMode:
            pass
        else:
            depth += 1

            # Fix parent of self ambiguity error: a taxon named like one of
            # its ancestors on this path gets a distinguishing suffix.
            if taxon in parents:
                if depth > Tree.SPECIES or taxa[0] == 'Eukarya':
                    taxon += " (group)"
                else:
                    taxon += " (%s)" % Tree.depths[depth]

            # Fix incertae sedis only issues (still in 106)
            if taxon == "Incertae Sedis" or taxon == "Incertae_sedis":
                taxon = "%s Incertae Sedis" % parent.name

            # Find or create node
            node = self.getNode(taxon)
            if node:
                # Fix parent conflicts
                if node.parent is not parent:
                    if node not in self.problemNodes:
                        sys.stderr.write("Warning: Conflicting placement "
                                         "of node %s" % taxon)
                        sys.stderr.write("( parent: %s) Parent of "
                                         "existing: %s\n\n" %
                                         (node.parent, parent))
                        self.problemNodes.append(node)

                    # Two accession number nodes with different taxonomy
                    # not allowed. Skip these.
                    if taxon == accession:
                        return

                    # Otherwise try to find different name
                    altname = "%s (%s)" % (taxon, parent.name)
                    node = self.getNode(altname)
                    alt = 2

                    # Backup plan if we are still not ok: number the name
                    # until it is unused or already sits below parent.
                    while node and not node.parent is parent:
                        altname = "%s (%s)" % (taxon, alt)
                        alt += 1
                        node = self.getNode(altname)

                    # Add new node with alternative name.
                    if not node:
                        node = Node(name=altname, parent=parent,
                                    nodeID=self.newID(), depth=depth)
                        self.addNode(node)
                else:
                    # Existing node in the expected place: continue from
                    # its highest rank.
                    depth = node.getHighestRank()
            else:
                # Fix depth issues: rank is what the new node is created
                # with; depth is the running level for the path.
                rank = depth
                if (taxon == "Plastid" or taxon == "Nucleus" or
                        taxon == "Mitochondrion"):
                    rank = Tree.NORANK
                    depth = Tree.DOMAIN
                # Find depth
                elif taxon is taxa[-1] and depth < Tree.SPECIES:
                    if not species:
                        rank = Tree.SPECIES
                    else:
                        rank = Tree.SUBSPECIES
                elif (taxon is taxa[-2] and species and
                        depth < Tree.SPECIES):
                    rank = Tree.SPECIES
                elif taxon is taxa[0]:
                    depth = rank = Tree.DOMAIN
                elif GGfix:
                    if dsym == "k":
                        rank = Tree.DOMAIN
                    elif dsym == "p":
                        rank = Tree.PHYLUM
                    elif dsym == "c":
                        rank = Tree.CLASS
                    elif dsym == "o":
                        rank = Tree.ORDER
                    elif dsym == "f":
                        rank = Tree.FAMILY
                    elif dsym == "g":
                        rank = Tree.GENUS
                    elif dsym =="s":
                        rank = Tree.SPECIES
                elif taxon in self.phylumNames:
                    depth = rank = Tree.PHYLUM
                elif taxon in self.classNames:
                    depth = rank = Tree.CLASS
                elif taxon in self.orderNames or taxon[-4:] == "ales":
                    depth = rank = Tree.ORDER
                elif taxon in self.familyNames or taxon[-4:] == "ceae":
                    depth = rank = Tree.FAMILY
                elif taxon in self.genusNames:
                    depth = rank = Tree.GENUS

                # Outside GGMode, ancestors whose depth is not above the
                # new node's rank are reset to Tree.NORANK.
                p = parent
                if not GGMode:
                    while (rank > Tree.META and p.depth and
                           p.depth >= rank):
                        p.depth = Tree.NORANK
                        if p.parent:
                            p = p.parent

                # Add node
                node = Node(name=taxon, parent=parent, nodeID=self.newID(),
                            depth=rank)
                self.addNode(node)

            parent = node
            parents.append(node.name)
            if taxon is taxa[-1]:
                # Associate accession with taxa
                node.assignRead(Read(accession))
        i += 1
class LCAClassifier(Tree):
    """A classifier instance inherits from the standard Tree and assigns
    reads to it representing the sequence reads of the classified dataset"""

    def __init__(self, name, minFilter=True, fastafile=None, qualfile=None,
                 otus=None):
        """Create an empty classification tree and optionally load reads.

        name      -- name passed on to Tree.__init__.
        minFilter -- enables the similarity based rank filter in assign().
        fastafile -- optional FASTA path; sequences go to self.seqs by id.
        qualfile  -- optional QUAL path; records go to self.qual by id.
        otus      -- accepted for interface compatibility; not used here.
        """
        Tree.__init__(self, Node(name="root", nodeID=1), name=name)
        self.bsr = BSR_DEFAULT
        self.ms = MS_DEFAULT
        self.minFilter = minFilter
        self.noHits = Node("No hits", parent=self.root)
        self.addNode(self.noHits)
        self.seqs = {}
        self.qual = {}
        # Query name -> node the read was finally assigned to.
        self.read_node_assignments = {}
        if fastafile:
            # Context managers close the handles even if parsing fails.
            with open(fastafile, 'r') as fstream:
                for seq_record in SeqIO.parse(fstream, "fasta"):
                    self.seqs[seq_record.id] = seq_record.seq
        if qualfile:
            with open(qualfile, 'r') as qstream:
                for q_record in SeqIO.parse(qstream, "qual"):
                    self.qual[q_record.id] = q_record
        #TODO Process tab- or comma-separated otu-file

    def assign(self, records, datasets=None, abundances=None, verbose=False,
               euk_filter=False):
        """Assign every BLAST record's query to the tree by LCA.

        records    -- iterable of Biopython BLAST records.
        datasets   -- optional list of dataset names. Without it, reads are
                      assigned once with dataset=None.
        abundances -- optional mapping of query name to a sequence of
                      abundances, in the same order as datasets. If not
                      given, the abundance is parsed from the query name
                      ("..._N", "numreads=N" or "size=N"), defaulting to 1.
        verbose    -- print similarity / filtering diagnostics.
        euk_filter -- send reads whose lineage contains the "Eukaryota"
                      node to the "No hits" node.

        Reads without a known abundance are skipped. Reads whose best hit
        is below self.ms, or that have no alignments, go to "No hits".
        """
        import re  # local: only needed for the size annotations below

        for record in records:
            qName = record.query.split(" ")[0]
            read_abundances = {}
            # Abundance used when no datasets are given (dataset=None).
            population = None

            if abundances:
                # Determine read population from the OTU table.
                try:
                    if qName in abundances:
                        seq_abundances = abundances[qName]
                    else:
                        # Retry with the name cut at its first "_".
                        seq_abundances = abundances[qName[:qName.find("_")]]
                    for i, ds in enumerate(datasets):
                        read_abundances[ds] = seq_abundances[i]
                except (KeyError, IndexError, TypeError):
                    print("Warning: Cannot find %s in OTU table!" % qName)
            else:
                # Determine read population from its name / annotation.
                if "_" in qName:
                    try:
                        readPopulation = int(
                            qName.split("_")[-1].replace(".00", ""))
                    except ValueError:
                        readPopulation = 1
                else:
                    # Take the digits after the key, so "size=12" and
                    # "size=12;" both give 12 (slicing off the last
                    # character lost a digit in the first form).
                    match = (re.search(r"numreads=(\d+)", qName) or
                             re.search(r"size=(\d+)", qName))
                    readPopulation = int(match.group(1)) if match else 1
                if datasets:
                    for ds in datasets:
                        read_abundances[ds] = readPopulation
                else:
                    population = readPopulation

            # Without datasets read_abundances stays empty; the single
            # population then decides whether the read is processed.
            has_population = bool(read_abundances) or population is not None
            alignments = record.alignments

            # Check for minimum score and any alignments
            if (alignments and has_population and
                    alignments[0].hsps[0].bits >= self.ms):
                best_hsp = alignments[0].hsps[0]
                topScore = best_hsp.bits
                if self.seqs and qName in self.seqs:
                    qSeq = self.seqs[qName]
                else:
                    qSeq = str(best_hsp.query).replace("-", "")

                hitname = alignments[0].hit_def.split()[0]
                node = self.getNode(hitname)
                if not node:
                    sys.stderr.write("Best-scoring node %s not found!\n" %
                                     hitname)
                    sys.stderr.write("Cannot assign read %s\n" % qName)
                    continue

                parents = node.getPhylogeny()[1:]

                # Iterate through rest of hits until falling below threshold
                for a in alignments[1:]:
                    if a.hsps[0].bits < float(topScore) * self.bsr:
                        break
                    hitname = a.hit_def.split()[0]
                    n = self.getNode(hitname)
                    if not n:
                        sys.stderr.write("Node " + hitname +
                                         " not found! Ignoring.\n")
                    else:
                        # Walk up from the hit until reaching a node of the
                        # current lineage, then cut the lineage there.
                        p = n.parent
                        while p not in parents:
                            p = p.parent
                        parents = parents[parents.index(p):]

                # Take a look at similarity, print info if verbose and
                # kick up if filter
                hsp_sim = (float(best_hsp.identities) /
                           float(best_hsp.align_length))
                if verbose and hsp_sim >= .99:
                    print("Read %s is %s percent similar to %s" %
                          (qName, hsp_sim * 100, alignments[0].hit_def))

                if self.minFilter:
                    maxRankLimit = Tree.SPECIES
                    maxRank = maxRankLimit
                    d = maxRankLimit
                    # Highest rank first; stop at the first limit met.
                    for rank in sorted(sfLimits, reverse=True):
                        if hsp_sim < sfLimits[rank]:
                            maxRank = rank - 1
                        else:
                            break

                    while (maxRank < Tree.SPECIES and
                           maxRank < parents[0].getHighestRank()):
                        d = min(parents[0].getHighestRank(), maxRankLimit)
                        if verbose:
                            print("Read %s cannot be assigned to "
                                  "rank %s (similarity=%s)" %
                                  (qName, Tree.depths[d], hsp_sim))
                        parents = parents[1:]

                    if d < maxRankLimit:
                        # Reads kicked up go to an "Unknown ..." child of
                        # the remaining ancestor.
                        novelName = ("Unknown %s %s" %
                                     (parents[0].name, Tree.depths[d]))
                        novelNode = self.getNode(novelName)
                        if not novelNode:
                            depth = parents[0].getHighestRank() + 1
                            novelNode = Node(novelName, parent=parents[0],
                                             depth=depth)
                            self.addNode(novelNode)
                        parents = [novelNode] + parents

                # Handle assignment
                read = Read(qName, seq=qSeq)
                if euk_filter and self.getNode("Eukaryota") in parents:
                    parents = [self.noHits]

                if datasets:
                    for ds in datasets:
                        ra = read_abundances.get(ds, 0)
                        if ra > 0:
                            parents[0].assignRead(read, dataset=ds,
                                                  abundance=ra,
                                                  primary=True,
                                                  recursive=True)
                else:
                    parents[0].assignRead(read, dataset=None,
                                          abundance=population,
                                          primary=True, recursive=True)
                self.read_node_assignments[qName] = parents[0]

            # Below min. score or no hits
            elif has_population:
                if self.seqs and qName in self.seqs:
                    qSeq = self.seqs[qName]
                elif alignments:
                    qSeq = alignments[0].hsps[0].query.replace("-", "")
                else:
                    qSeq = None
                nhr = Read(name=qName, seq=qSeq)

                if datasets:
                    for ds in datasets:
                        ra = read_abundances.get(ds, 0)
                        if ra > 0:
                            self.noHits.assignRead(nhr, dataset=ds,
                                                   abundance=ra,
                                                   primary=True)
                            self.root.assignRead(nhr, dataset=ds,
                                                 abundance=ra,
                                                 primary=False)
                else:
                    self.noHits.assignRead(nhr, dataset=None,
                                           abundance=population,
                                           primary=True)
                    self.root.assignRead(nhr, dataset=None,
                                         abundance=population,
                                         primary=False)
                self.read_node_assignments[qName] = self.noHits

    def setBitscoreRange(self, percent):
        """Set the LCA bit score range as a percentage of the top score."""
        self.bsr = 1 - float(percent) / 100

    def setMinScore(self, minScore):
        """Set the minimum bit score the best hit needs for assignment."""
        self.ms = minScore

    def printAssignmentsRDPQual(self, node, dataset=None, printFile=None,
                                newTabStyle=False):
        """Write quality records of reads assigned at node and below.

        For every primary read of node a ">name<TAB>lineage" header and the
        read's quality values (from self.qual) are written to printFile,
        or printed if printFile is not given; children are then visited
        recursively.
        """
        assignments = node.getAssignment(dataset)
        if assignments and assignments.primReads:
            for r in assignments.primReads:
                header = (">%s\t%s\n" %
                          (r.name, node.getPhylogenyRDPStyle(
                              root=False, newTabStyle=newTabStyle)))
                # Skip the record's own header line, join the rest.
                qualLines = self.qual[str(r.name)].format("qual")
                toPrint = header + "".join(
                    line + " " for line in qualLines.split("\n")[1:])
                if printFile:
                    printFile.write(toPrint + "\n")
                else:
                    print(toPrint)
        if assignments:
            for child in node.children:
                self.printAssignmentsRDPQual(node=child, dataset=dataset,
                                             printFile=printFile,
                                             newTabStyle=newTabStyle)

    def writeBIOMTable(self, bTable, biomOut):
        """Add taxonomy metadata to bTable and write it as BIOM JSON.

        Observations without a recorded assignment are reported on stderr
        and given the lineage of the "No hits" node.
        """
        taxonomy_dict = {}
        for obs in bTable.ObservationIds:
            aNode = self.read_node_assignments.get(obs)
            if aNode is None:
                # write() takes a single string; build the message first.
                sys.stderr.write("Problem with assignment of %s: "
                                 "classification not found!\n" % obs)
                aNode = self.noHits
            taxonomy_dict[obs] = {"taxonomy": aNode.getPhylogenyNameList()}
        bTable.addObservationMetadata(taxonomy_dict)
        biomOut.write(bTable.getBiomFormatJsonString("CREST"))

    def printAllSequences(self, fastaOut):
        """Write all assigned sequences to fastaOut in FASTA format with
        the assigned lineage in the header."""
        for seqName, node in self.read_node_assignments.items():
            try:
                sn = self.seqs[seqName]
            except KeyError:
                # Retry with the name cut at its first "_".
                sn = self.seqs[seqName[:seqName.find("_")]]
            fastaOut.write(">%s %s\n%s\n" %
                           (seqName,
                            node.getPhylogenyRDPStyle(root=False), sn))

    def writeOTUsWithAssignments(self, otusOut, abundances, datasets,
                                 sep="\t"):
        """Write the OTU table with an added classification column.

        Raises KeyError if an OTU in abundances has no recorded assignment.
        """
        otusOut.write("OTU" + sep)
        for ds in datasets:
            otusOut.write(ds + sep)
        otusOut.write("classification\n")
        for seqName in abundances:
            otusOut.write(seqName)
            for ab in abundances[seqName]:
                otusOut.write("%s%i" % (sep, ab))
            assigned = self.read_node_assignments[seqName]
            taxonomy = assigned.getPhylogenyRDPStyle(root=False)
            otusOut.write("%s%s\n" % (sep, taxonomy))
def assign(self, records, datasets=None, abundances=None, verbose=False,
           euk_filter=False):
    """Assign every BLAST record's query to the tree by LCA.

    records    -- iterable of Biopython BLAST records.
    datasets   -- optional list of dataset names. Without it, reads are
                  assigned once with dataset=None.
    abundances -- optional mapping of query name to a sequence of
                  abundances, in the same order as datasets. If not given,
                  the abundance is parsed from the query name ("..._N",
                  "numreads=N" or "size=N"), defaulting to 1.
    verbose    -- print similarity / filtering diagnostics.
    euk_filter -- send reads whose lineage contains the "Eukaryota" node
                  to the "No hits" node.

    Reads without a known abundance are skipped. Reads whose best hit is
    below self.ms, or that have no alignments, go to "No hits".
    """
    import re  # local: only needed for the size annotations below

    for record in records:
        qName = record.query.split(" ")[0]
        read_abundances = {}
        # Abundance used when no datasets are given (dataset=None).
        population = None

        if abundances:
            # Determine read population from the OTU table.
            try:
                if qName in abundances:
                    seq_abundances = abundances[qName]
                else:
                    # Retry with the name cut at its first "_".
                    seq_abundances = abundances[qName[:qName.find("_")]]
                for i, ds in enumerate(datasets):
                    read_abundances[ds] = seq_abundances[i]
            except (KeyError, IndexError, TypeError):
                print("Warning: Cannot find %s in OTU table!" % qName)
        else:
            # Determine read population from its name / annotation.
            if "_" in qName:
                try:
                    readPopulation = int(
                        qName.split("_")[-1].replace(".00", ""))
                except ValueError:
                    readPopulation = 1
            else:
                # Take the digits after the key, so "size=12" and
                # "size=12;" both give 12 (slicing off the last character
                # lost a digit in the first form).
                match = (re.search(r"numreads=(\d+)", qName) or
                         re.search(r"size=(\d+)", qName))
                readPopulation = int(match.group(1)) if match else 1
            if datasets:
                for ds in datasets:
                    read_abundances[ds] = readPopulation
            else:
                population = readPopulation

        # Without datasets read_abundances stays empty; the single
        # population then decides whether the read is processed.
        has_population = bool(read_abundances) or population is not None
        alignments = record.alignments

        # Check for minimum score and any alignments
        if (alignments and has_population and
                alignments[0].hsps[0].bits >= self.ms):
            best_hsp = alignments[0].hsps[0]
            topScore = best_hsp.bits
            if self.seqs and qName in self.seqs:
                qSeq = self.seqs[qName]
            else:
                qSeq = str(best_hsp.query).replace("-", "")

            hitname = alignments[0].hit_def.split()[0]
            node = self.getNode(hitname)
            if not node:
                sys.stderr.write("Best-scoring node %s not found!\n" %
                                 hitname)
                sys.stderr.write("Cannot assign read %s\n" % qName)
                continue

            parents = node.getPhylogeny()[1:]

            # Iterate through rest of hits until falling below threshold
            for a in alignments[1:]:
                if a.hsps[0].bits < float(topScore) * self.bsr:
                    break
                hitname = a.hit_def.split()[0]
                n = self.getNode(hitname)
                if not n:
                    sys.stderr.write("Node " + hitname +
                                     " not found! Ignoring.\n")
                else:
                    # Walk up from the hit until reaching a node of the
                    # current lineage, then cut the lineage there.
                    p = n.parent
                    while p not in parents:
                        p = p.parent
                    parents = parents[parents.index(p):]

            # Take a look at similarity, print info if verbose and
            # kick up if filter
            hsp_sim = (float(best_hsp.identities) /
                       float(best_hsp.align_length))
            if verbose and hsp_sim >= .99:
                print("Read %s is %s percent similar to %s" %
                      (qName, hsp_sim * 100, alignments[0].hit_def))

            if self.minFilter:
                maxRankLimit = Tree.SPECIES
                maxRank = maxRankLimit
                d = maxRankLimit
                # Highest rank first; stop at the first limit met.
                for rank in sorted(sfLimits, reverse=True):
                    if hsp_sim < sfLimits[rank]:
                        maxRank = rank - 1
                    else:
                        break

                while (maxRank < Tree.SPECIES and
                       maxRank < parents[0].getHighestRank()):
                    d = min(parents[0].getHighestRank(), maxRankLimit)
                    if verbose:
                        print("Read %s cannot be assigned to "
                              "rank %s (similarity=%s)" %
                              (qName, Tree.depths[d], hsp_sim))
                    parents = parents[1:]

                if d < maxRankLimit:
                    # Reads kicked up go to an "Unknown ..." child of the
                    # remaining ancestor.
                    novelName = ("Unknown %s %s" %
                                 (parents[0].name, Tree.depths[d]))
                    novelNode = self.getNode(novelName)
                    if not novelNode:
                        depth = parents[0].getHighestRank() + 1
                        novelNode = Node(novelName, parent=parents[0],
                                         depth=depth)
                        self.addNode(novelNode)
                    parents = [novelNode] + parents

            # Handle assignment
            read = Read(qName, seq=qSeq)
            if euk_filter and self.getNode("Eukaryota") in parents:
                parents = [self.noHits]

            if datasets:
                for ds in datasets:
                    ra = read_abundances.get(ds, 0)
                    if ra > 0:
                        parents[0].assignRead(read, dataset=ds,
                                              abundance=ra, primary=True,
                                              recursive=True)
            else:
                parents[0].assignRead(read, dataset=None,
                                      abundance=population, primary=True,
                                      recursive=True)
            self.read_node_assignments[qName] = parents[0]

        # Below min. score or no hits
        elif has_population:
            if self.seqs and qName in self.seqs:
                qSeq = self.seqs[qName]
            elif alignments:
                qSeq = alignments[0].hsps[0].query.replace("-", "")
            else:
                qSeq = None
            nhr = Read(name=qName, seq=qSeq)

            if datasets:
                for ds in datasets:
                    ra = read_abundances.get(ds, 0)
                    if ra > 0:
                        self.noHits.assignRead(nhr, dataset=ds,
                                               abundance=ra, primary=True)
                        self.root.assignRead(nhr, dataset=ds,
                                             abundance=ra, primary=False)
            else:
                self.noHits.assignRead(nhr, dataset=None,
                                       abundance=population, primary=True)
                self.root.assignRead(nhr, dataset=None,
                                     abundance=population, primary=False)
            self.read_node_assignments[qName] = self.noHits
class LCAClassifier(Tree):
    """A classifier instance inherits from the standard Tree and assigns
    reads to it representing the sequence reads of the classified dataset"""

    def __init__(self, name, minFilter=True, fastafile=None, qualfile=None,
                 otus=None):
        """Create an empty classification tree and optionally load reads.

        name      -- name passed on to Tree.__init__.
        minFilter -- enables the similarity based rank filter in assign().
        fastafile -- optional FASTA path; sequences go to self.seqs by id.
        qualfile  -- optional QUAL path; records go to self.qual by id.
        otus      -- accepted for interface compatibility; not used here.
        """
        Tree.__init__(self, Node(name="root", nodeID=1), name=name)
        self.bsr = BSR_DEFAULT
        self.ms = MS_DEFAULT
        self.minFilter = minFilter
        self.noHits = Node("No hits", parent=self.root)
        self.addNode(self.noHits)
        self.seqs = {}
        self.qual = {}
        # Query name -> node the read was finally assigned to.
        self.read_node_assignments = {}
        if fastafile:
            # Context managers close the handles even if parsing fails.
            with open(fastafile, 'r') as fstream:
                for seq_record in SeqIO.parse(fstream, "fasta"):
                    self.seqs[seq_record.id] = seq_record.seq
        if qualfile:
            with open(qualfile, 'r') as qstream:
                for q_record in SeqIO.parse(qstream, "qual"):
                    self.qual[q_record.id] = q_record
        #TODO Process tab- or comma-separated otu-file

    def assign(self, records, datasets=None, abundances=None, verbose=False,
               euk_filter=False):
        """Assign every BLAST record's query to the tree by LCA.

        records    -- iterable of Biopython BLAST records.
        datasets   -- optional list of dataset names. Without it, reads are
                      assigned once with dataset=None.
        abundances -- optional mapping of query name to a sequence of
                      abundances, in the same order as datasets. If not
                      given, the abundance is parsed from the query name
                      ("..._N", "numreads=N" or "size=N"), defaulting to 1.
        verbose    -- print similarity / filtering diagnostics.
        euk_filter -- send reads whose lineage contains the "Eukaryota"
                      node to the "No hits" node.

        Reads without a known abundance are skipped. Reads whose best hit
        is below self.ms, or that have no alignments, go to "No hits".
        """
        import re  # local: only needed for the size annotations below

        for record in records:
            qName = record.query.split(" ")[0]
            read_abundances = {}
            # Abundance used when no datasets are given (dataset=None).
            population = None

            if abundances:
                # Determine read population from the OTU table.
                try:
                    if qName in abundances:
                        seq_abundances = abundances[qName]
                    else:
                        # Retry with the name cut at its first "_".
                        seq_abundances = abundances[qName[:qName.find("_")]]
                    for i, ds in enumerate(datasets):
                        read_abundances[ds] = seq_abundances[i]
                except (KeyError, IndexError, TypeError):
                    print("Warning: Cannot find %s in OTU table!" % qName)
            else:
                # Determine read population from its name / annotation.
                if "_" in qName:
                    try:
                        readPopulation = int(
                            qName.split("_")[-1].replace(".00", ""))
                    except ValueError:
                        readPopulation = 1
                else:
                    # Take the digits after the key, so "size=12" and
                    # "size=12;" both give 12 (slicing off the last
                    # character lost a digit in the first form).
                    match = (re.search(r"numreads=(\d+)", qName) or
                             re.search(r"size=(\d+)", qName))
                    readPopulation = int(match.group(1)) if match else 1
                if datasets:
                    for ds in datasets:
                        read_abundances[ds] = readPopulation
                else:
                    population = readPopulation

            # Without datasets read_abundances stays empty; the single
            # population then decides whether the read is processed.
            has_population = bool(read_abundances) or population is not None
            alignments = record.alignments

            # Check for minimum score and any alignments
            if (alignments and has_population and
                    alignments[0].hsps[0].bits >= self.ms):
                best_hsp = alignments[0].hsps[0]
                topScore = best_hsp.bits
                if self.seqs and qName in self.seqs:
                    qSeq = self.seqs[qName]
                else:
                    qSeq = str(best_hsp.query).replace("-", "")

                hitname = alignments[0].hit_def.split()[0]
                node = self.getNode(hitname)
                if not node:
                    sys.stderr.write("Best-scoring node %s not found!\n" %
                                     hitname)
                    sys.stderr.write("Cannot assign read %s\n" % qName)
                    continue

                parents = node.getPhylogeny()[1:]

                # Iterate through rest of hits until falling below threshold
                for a in alignments[1:]:
                    if a.hsps[0].bits < float(topScore) * self.bsr:
                        break
                    hitname = a.hit_def.split()[0]
                    n = self.getNode(hitname)
                    if not n:
                        sys.stderr.write("Node " + hitname +
                                         " not found! Ignoring.\n")
                    else:
                        # Walk up from the hit until reaching a node of the
                        # current lineage, then cut the lineage there.
                        p = n.parent
                        while p not in parents:
                            p = p.parent
                        parents = parents[parents.index(p):]

                # Take a look at similarity, print info if verbose and
                # kick up if filter
                hsp_sim = (float(best_hsp.identities) /
                           float(best_hsp.align_length))
                if verbose and hsp_sim >= .99:
                    print("Read %s is %s percent similar to %s" %
                          (qName, hsp_sim * 100, alignments[0].hit_def))

                if self.minFilter:
                    maxRankLimit = Tree.SPECIES
                    maxRank = maxRankLimit
                    d = maxRankLimit
                    # Highest rank first; stop at the first limit met.
                    for rank in sorted(sfLimits, reverse=True):
                        if hsp_sim < sfLimits[rank]:
                            maxRank = rank - 1
                        else:
                            break

                    while (maxRank < Tree.SPECIES and
                           maxRank < parents[0].getHighestRank()):
                        d = min(parents[0].getHighestRank(), maxRankLimit)
                        if verbose:
                            print("Read %s cannot be assigned to "
                                  "rank %s (similarity=%s)" %
                                  (qName, Tree.depths[d], hsp_sim))
                        parents = parents[1:]

                    if d < maxRankLimit:
                        # Reads kicked up go to an "Unknown ..." child of
                        # the remaining ancestor.
                        novelName = ("Unknown %s %s" %
                                     (parents[0].name, Tree.depths[d]))
                        novelNode = self.getNode(novelName)
                        if not novelNode:
                            depth = parents[0].getHighestRank() + 1
                            novelNode = Node(novelName, parent=parents[0],
                                             depth=depth)
                            self.addNode(novelNode)
                        parents = [novelNode] + parents

                # Handle assignment
                read = Read(qName, seq=qSeq)
                if euk_filter and self.getNode("Eukaryota") in parents:
                    parents = [self.noHits]

                if datasets:
                    for ds in datasets:
                        ra = read_abundances.get(ds, 0)
                        if ra > 0:
                            parents[0].assignRead(read, dataset=ds,
                                                  abundance=ra,
                                                  primary=True,
                                                  recursive=True)
                else:
                    parents[0].assignRead(read, dataset=None,
                                          abundance=population,
                                          primary=True, recursive=True)
                self.read_node_assignments[qName] = parents[0]

            # Below min. score or no hits
            elif has_population:
                if self.seqs and qName in self.seqs:
                    qSeq = self.seqs[qName]
                elif alignments:
                    qSeq = alignments[0].hsps[0].query.replace("-", "")
                else:
                    qSeq = None
                nhr = Read(name=qName, seq=qSeq)

                if datasets:
                    for ds in datasets:
                        ra = read_abundances.get(ds, 0)
                        if ra > 0:
                            self.noHits.assignRead(nhr, dataset=ds,
                                                   abundance=ra,
                                                   primary=True)
                            self.root.assignRead(nhr, dataset=ds,
                                                 abundance=ra,
                                                 primary=False)
                else:
                    self.noHits.assignRead(nhr, dataset=None,
                                           abundance=population,
                                           primary=True)
                    self.root.assignRead(nhr, dataset=None,
                                         abundance=population,
                                         primary=False)
                self.read_node_assignments[qName] = self.noHits

    def setBitscoreRange(self, percent):
        """Set the LCA bit score range as a percentage of the top score."""
        self.bsr = 1 - float(percent) / 100

    def setMinScore(self, minScore):
        """Set the minimum bit score the best hit needs for assignment."""
        self.ms = minScore

    def printAssignmentsRDPQual(self, node, dataset=None, printFile=None,
                                newTabStyle=False):
        """Write quality records of reads assigned at node and below.

        For every primary read of node a ">name<TAB>lineage" header and the
        read's quality values (from self.qual) are written to printFile,
        or printed if printFile is not given; children are then visited
        recursively.
        """
        assignments = node.getAssignment(dataset)
        if assignments and assignments.primReads:
            for r in assignments.primReads:
                header = (">%s\t%s\n" %
                          (r.name, node.getPhylogenyRDPStyle(
                              root=False, newTabStyle=newTabStyle)))
                # Skip the record's own header line, join the rest.
                qualLines = self.qual[str(r.name)].format("qual")
                toPrint = header + "".join(
                    line + " " for line in qualLines.split("\n")[1:])
                if printFile:
                    printFile.write(toPrint + "\n")
                else:
                    print(toPrint)
        if assignments:
            for child in node.children:
                self.printAssignmentsRDPQual(node=child, dataset=dataset,
                                             printFile=printFile,
                                             newTabStyle=newTabStyle)

    def writeBIOMTable(self, bTable, biomOut):
        """Add taxonomy metadata to bTable and write it as BIOM JSON.

        Observations without a recorded assignment are reported on stderr
        and given the lineage of the "No hits" node.
        """
        taxonomy_dict = {}
        for obs in bTable.ObservationIds:
            aNode = self.read_node_assignments.get(obs)
            if aNode is None:
                # write() takes a single string; build the message first.
                sys.stderr.write("Problem with assignment of %s: "
                                 "classification not found!\n" % obs)
                aNode = self.noHits
            taxonomy_dict[obs] = {"taxonomy": aNode.getPhylogenyNameList()}
        bTable.addObservationMetadata(taxonomy_dict)
        biomOut.write(bTable.getBiomFormatJsonString("CREST"))

    def printAllSequences(self, fastaOut):
        """Write all assigned sequences to fastaOut in FASTA format with
        the assigned lineage in the header."""
        for seqName, node in self.read_node_assignments.items():
            try:
                sn = self.seqs[seqName]
            except KeyError:
                # Retry with the name cut at its first "_".
                sn = self.seqs[seqName[:seqName.find("_")]]
            fastaOut.write(">%s %s\n%s\n" %
                           (seqName,
                            node.getPhylogenyRDPStyle(root=False), sn))

    def writeOTUsWithAssignments(self, otusOut, abundances, datasets,
                                 sep="\t"):
        """Write the OTU table with an added classification column.

        Raises KeyError if an OTU in abundances has no recorded assignment.
        """
        otusOut.write("OTU" + sep)
        for ds in datasets:
            otusOut.write(ds + sep)
        otusOut.write("classification\n")
        for seqName in abundances:
            otusOut.write(seqName)
            for ab in abundances[seqName]:
                otusOut.write("%s%i" % (sep, ab))
            assigned = self.read_node_assignments[seqName]
            taxonomy = assigned.getPhylogenyRDPStyle(root=False)
            otusOut.write("%s%s\n" % (sep, taxonomy))
# Command-line entry: parse a QIIME/RDP assignment file and print either
# the whole tree ('tree' argument given) or the populations per rank.
if len(sys.argv) < 3:
    print("Use qiimeAssignmentParse.py rdp_out_file confidence "
          "(0-1.0) ['tree'] ")
else:
    # Any argument after the confidence may request tree output.
    tree = 'tree' in sys.argv[3:]
    mt = RDPQIIMETree(name=sys.argv[1])

    # Pass the threshold as a number so it is compared numerically
    # rather than as text.
    mt.parseQIIME(sys.argv[1], confidence=float(sys.argv[2]))
    mt.pruneUnassigned()

    if tree:
        # Snapshot the children first: moving nodes changes the list.
        dn = list(mt.root.children)
        co = Node(parent=mt.root, name="Cellular organisms", population=0,
                  reads=[], singles=0, doubles=0)
        for d in dn:
            mt.moveNode(d, co)
        mt.printAsTree()
    else:
        for level in RDPTree.depths.keys():
            print("Assingments at %s level" % RDPTree.depths[level])
            print(mt.printPopulationsAtDepth(level, normaliseToBase=False))
            print("")