Beispiel #1
0
    def __init__(self,
                 name,
                 minFilter=True,
                 fastafile=None,
                 qualfile=None,
                 otus=None):
        """Create an empty classification tree and optionally preload reads.

        name      -- name handed on to Tree.__init__
        minFilter -- stored as self.minFilter
        fastafile -- optional path of a FASTA file; the sequence of every
                     record is stored in self.seqs, keyed by the record id
        qualfile  -- optional path of a QUAL file; every record is stored in
                     self.qual, keyed by the record id
        otus      -- accepted but not used by this constructor
        """
        Tree.__init__(self, Node(name="root", nodeID=1), name=name)
        self.bsr = BSR_DEFAULT
        self.ms = MS_DEFAULT
        self.minFilter = minFilter
        # Reads that cannot be classified are collected under this node.
        self.noHits = Node("No hits", parent=self.root)
        self.addNode(self.noHits)
        self.seqs = {}
        self.qual = {}
        self.read_node_assignments = {}

        # The files are only needed while parsing, so close them
        # deterministically instead of leaking the handles.
        if fastafile:
            with open(fastafile, 'r') as fstream:
                for seq_record in SeqIO.parse(fstream, "fasta"):
                    self.seqs[seq_record.id] = seq_record.seq

        if qualfile:
            with open(qualfile, 'r') as qstream:
                for q_record in SeqIO.parse(qstream, "qual"):
                    self.qual[q_record.id] = q_record
Beispiel #2
0
 def __init__(self, name=None):
     """Create the tree skeleton and load the optional NCBI rank name lists.

     name -- name handed on to Tree.__init__

     A "Cellular organisms" node (nodeID 2) is added below the root.  The
     rank name lists are read, one name per line, from the files
     all_ncbi_phylum, all_ncbi_class, all_ncbi_order, all_ncbi_family and
     all_ncbi_genus in the directory $LCATaxonomyDir/ranks/.  Loading is
     best effort: if the environment variable is not set or a file cannot
     be read, the lists loaded so far are kept and the rest stay empty.
     """
     Tree.__init__(self, Node(name="root", nodeID=1), name)
     self.cellOrg = Node(name="Cellular organisms",
                         parent=self.root, nodeID=2)
     self.addNode(self.cellOrg)
     self.idCount = 10000000
     self.problemNodes = []
     self.rejected = []
     self.phylumNames = []
     self.classNames = []
     self.orderNames = []
     self.familyNames = []
     self.genusNames = []

     rankFiles = (("all_ncbi_phylum", self.phylumNames),
                  ("all_ncbi_class", self.classNames),
                  ("all_ncbi_order", self.orderNames),
                  ("all_ncbi_family", self.familyNames),
                  ("all_ncbi_genus", self.genusNames))

     try:
         rankDir = os.environ['LCATaxonomyDir'] + "/ranks/"
         for fileName, rankNames in rankFiles:
             with open(rankDir + fileName, 'r') as readFile:
                 for line in readFile:
                     # Strip only the newline, so that a last line without
                     # a terminator does not lose its final character.
                     rankNames.append(line.rstrip("\n"))
     except (KeyError, IOError, OSError):
         # Rank lists are optional: a missing environment variable
         # (KeyError) or an unreadable file is deliberately ignored.
         pass
    def parseQIIME(self, outFile, confidence=0.5, weighFlows=True):
        """Read a QIIME-style assignment file and assign its reads to the tree.

        Every non-blank line holds three tab-separated fields: read name,
        ';'-separated taxonomy path and confidence, e.g.

            IO95TX02.787F-MID-1_s60_c01_T400_s30_c08_2_3
            Bacteria;Proteobacteria;Deltaproteobacteria;Desulfobulbaceae
            0.82

        outFile    -- path of the assignment file
        confidence -- minimum confidence needed to follow the taxonomy path;
                      reads below it are assigned to the root
        weighFlows -- accepted but not used by this method

        Raises ValueError if the confidence field is not a number.
        """
        with open(outFile, 'r') as qiimeFile:
            for line in qiimeFile:
                # Blank lines (e.g. a trailing newline) carry no assignment.
                if not line.strip():
                    continue

                sl = line.split("\t")
                rName = sl[0]
                taxa = sl[1].split(";")
                # Must be numeric: in Python 2 a str always compares greater
                # than a float, so the raw field would pass every threshold.
                conf = float(sl[2])

                read = RDPRead(name=rName, reverse=None)
                node = parent = self.root

                for depth, rawName in enumerate(taxa):
                    nodeName = rawName.replace("\"", "")

                    # Make the name unique if an ancestor on this path
                    # already carries the same name.
                    p = parent
                    while p is not self.root:
                        if nodeName == p.name:
                            nodeName += " (%s)" % p.name
                            break
                        p = p.parent

                    if conf >= confidence:
                        node = self.getNode(nodeName)
                        if not node:
                            node = Node(parent=parent, name=nodeName,
                                        depth=depth)
                            self.addNode(node)
                        parent = node

                node.assignRead(read, primary=True, recursive=True)
Beispiel #4
0
    def __init__(self, name, minFilter=True, fastafile=None, qualfile=None, otus=None):
        """Set up an empty classifier tree and optionally preload read data.

        name      -- name handed on to Tree.__init__
        minFilter -- stored as self.minFilter
        fastafile -- optional FASTA path; sequences go to self.seqs by id
        qualfile  -- optional QUAL path; records go to self.qual by id
        otus      -- accepted but not used by this constructor
        """
        Tree.__init__(self, Node(name="root", nodeID=1), name=name)
        self.bsr = BSR_DEFAULT
        self.ms = MS_DEFAULT
        self.minFilter = minFilter
        # Node collecting the reads that cannot be classified.
        self.noHits = Node("No hits", parent=self.root)
        self.addNode(self.noHits)
        self.seqs = {}
        self.qual = {}
        self.read_node_assignments = {}

        # Context managers make sure the input files are closed again,
        # also when parsing fails half-way.
        if fastafile:
            with open(fastafile, 'r') as fstream:
                for seq_record in SeqIO.parse(fstream, "fasta"):
                    self.seqs[seq_record.id] = seq_record.seq

        if qualfile:
            with open(qualfile, 'r') as qstream:
                for q_record in SeqIO.parse(qstream, "qual"):
                    self.qual[q_record.id] = q_record
Beispiel #5
0
    def processChangesMetadata(self, line):
        """Apply one line of a manual-changes file to the tree.

        The line holds four ';'-separated fields:

            oldName;newName;newParent;rank

        * empty oldName: add taxon newName below newParent
        * empty newName and newParent: delete taxon oldName
        * newName without "..": rename oldName to newName and / or move it
          below newParent
        * newName containing "..": oldName is a "first..last" shorthand; it
          is only checked that both taxa are children of newParent

        Lines with fewer than four fields are reported on stderr and ignored,
        as are lines that name a parent that does not exist.
        """
        try:
            parts = line.split(";")
            oldName = parts[0]
            newName = parts[1]
            newParent = parts[2]
            rank = parts[3]
        except (AttributeError, IndexError):
            sys.stderr.write("Warning: Incorrect manual change line: %s\n" %
                             line)
            return

        parent = self.getNode(newParent)
        if newParent and not parent:
            sys.stderr.write("Cannot find parent node %s\n" % newParent)
            return

        # Depth of the affected node: an explicitly given rank wins,
        # otherwise it is placed one level below its new parent.
        depth = 0
        if rank:
            for d in ARBor.depths.keys():
                if ARBor.depths[d] == rank:
                    depth = d
                    break
        elif parent:
            depth = parent.depth + 1

        # Add new node
        if not oldName:
            if self.getNode(newName):
                print("Already present: %s" % newName)
            elif not parent:
                print("Warning: Cannot add taxon %s as parent %s "
                      "does not exist" % (newName, newParent))
            else:
                newNode = Node(name=newName, parent=parent,
                               nodeID=self.newID(), depth=depth)
                self.addNode(newNode)
                print("Added new taxon: %s" % newName)

        # Remove node
        elif not newName and not newParent:
            if not self.getNode(oldName):
                print("Already deleted: %s" % oldName)
            else:
                self.deleteNode(self.getNode(oldName), False)
                print("Deleted taxon: %s" % oldName)

        # Move or rename taxon
        elif ".." not in newName:
            n = self.getNode(oldName)
            if not n:
                if self.getNode(newName):
                    print("Already moved / renamed: %s" % oldName)
                else:
                    sys.stderr.write("Cannot find node: %s\n" % oldName)
            else:
                if newName:
                    self.renameNode(n, newName)
                    print("Renamed %s to %s" % (oldName, newName))
                if newParent and not (newParent == n.parent.name):
                    print("Moving %s from %s to %s" %
                          (oldName, n.parent.name, newParent))
                    self.moveNode(n, self.getNode(newParent))
                    n.depth = depth

        # Control shorthand annotation with ..
        else:
            # NOTE(review): the branch is selected by ".." in newName while
            # the range is read from oldName - confirm against the file
            # format that both fields carry the shorthand.
            rangeNodes = [self.getNode(taxonName)
                          for taxonName in oldName.split("..")[:2]]
            # Compare parent *names* with the requested parent name; a name
            # never equals the parent node object itself.  Missing nodes or
            # an incomplete range count as "not moved properly" as well.
            properlyMoved = (len(rangeNodes) == 2 and
                             all(node and node.parent and
                                 node.parent.name == newParent
                                 for node in rangeNodes))
            if not properlyMoved:
                sys.stderr.write("Warning: Taxons %s not moved properly in "
                                 "NDS file!!\n" % oldName)
Beispiel #6
0
    def _readNDSLine(self, line, eukaryotic=True, altTax=True, GGMode=True):
        parts = line.split("\t")
        accession = parts[0]
        if "." in accession:
            accession = accession[:accession.find(".")]
        taxonomy = parts[1]
        ncbi_name = parts[2]
        plast = False
        mito = False
        if (eukaryotic and len(parts) > 3):
            # or (GGMode and "Chloroplast" in taxonomy)
            if "Chloroplast" in parts[3]:
                plast = True
            elif "mitochondria" in parts[3].lower():
                mito = True


        parent = self.cellOrg
        depth = Tree.META

        if len(parts) < 1:
            print "Problem:\n%s" % line
            return
        if GGMode:
            taxonomy = taxonomy.replace("; ", ";")
            taxa = re.split('[/;]', taxonomy)
        else:
            taxa = re.split('[_/;]', taxonomy)
        ncbi_name = ncbi_name.replace("\n", "")

        # Do not use eukaryotic reads in non-eukaryotic mode
        if not eukaryotic and not GGMode:
            if taxa[0] == "Eukarya" or ("Chloroplast" in taxa) \
                or ("mitochondria" in taxa):
                self.rejected.append(accession)
                return

        # Ignore uncultured groups and handle like clustered to parent.
        if taxa[-1] == "uncultured":
            taxa = taxa[:-1]

        # Alt tax. fix
        if eukaryotic and altTax:
            if accession in self.rejected:
                del self.rejected[self.rejected.index(accession)]
            alt = ["Eukaryota"]
            if plast or (" plastid" in ncbi_name.lower()) or \
            ("chloroplast" in ncbi_name.lower()):
                alt.append("Plastid")
                plast = True
            elif  mito or ("mitochondrion" in ncbi_name.lower()):
                alt.append("Mitochondrion")
                mito = True
            else:
                alt.append("Nucleus")

            #Put extra labels on all childs of these new groups
            extra = ""
            if mito:
                extra = " (Mitochondrion)"
            elif plast:
                extra = " (Plastid)"
            for taxon in taxa[1:]:
                if len(taxon) > 0:
                    alt.append(taxon + extra)
            taxa = alt

        #Use NCBI Taxonomy species name if meaningful
        species = True
        for key in ARBor.nonSpeciesKeys:
            if key in ncbi_name:
                species = False
                break

        if species:
            if eukaryotic:
                if (plast or mito or
                    (" plastid" in ncbi_name.lower()) or
                    (" mitochondrion" in ncbi_name.lower())):
                    name_only = ncbi_name.replace(" Plastid", "")
                    name_only = name_only.replace(" plastid", "")
                    name_only = name_only.replace(" Mitochondrion", "")
                    name_only = name_only.replace(" mitochondrion", "")
                else:
                    name_only = ncbi_name
                    ncbi_name = name_only + " nucleus"
                if not altTax:
                    taxa.append(name_only)

            taxa.append(ncbi_name)
            # --- Remove line to use synonyms file instead
            taxa.append(accession)  # Makes synoynms file redundant.
            # ---
        else:
            #Not species
            taxa.append(accession)  # Remove line to use synonyms file instead
        
        i = 0
        parents = []
        for taxon in taxa:
            # If we are in GGMode then there will be a species name and accession 
            #that should not be touched:
            if (GGMode and len(taxon) > 3 and not
                ((taxon is taxa[-1]) or (taxon is taxa[-2] and species))):
                
                GGfix = True
                taxon = taxon[3:]
                dsym = taxon[0]
            else:
                GGfix = False

            #Check if taxon is numerical or empty
            if len(re.findall('[0-9]', taxon)) == len(taxon) and not GGMode:
                pass
            else:
                depth += 1

                #Fix parent of self ambigousity error
                if taxon in parents:
                    if depth > Tree.SPECIES or taxa[0] == 'Eukarya':
                        taxon += " (group)"
                    else:
                        taxon += " (%s)" % Tree.depths[depth]

                #Fix incertae sedis only issues (still in 106)
                if taxon == "Incertae Sedis" or taxon == "Incertae_sedis":
                    taxon = "%s Incertae Sedis" % parent.name

                #Find or create node
                node = self.getNode(taxon)
                if node:
                    # Fix parent conflicts
                    if node.parent is not parent:
                        if node not in self.problemNodes:
                            sys.stderr.write("Warning: Conflicting placement "
                                             "of node %s" % taxon)
                            sys.stderr.write("( parent: %s) Parent of "
                                             "existing: %s\n\n" %
                                             (node.parent, parent))
                            self.problemNodes.append(node)

                        # Two accession number nodes with different taxonomy
                        # not allowed. Skip these.
                        if taxon == accession:
                            return

                        # Otherwise try to find different name
                        altname = "%s (%s)" % (taxon, parent.name)
                        node = self.getNode(altname)
                        alt = 2

                        # Backup plan if we are still not ok
                        while node and not node.parent is parent:
                            altname = "%s (%s)" % (taxon, alt)
                            alt += 1
                            node = self.getNode(altname)

                        # Add new node with alternative name.
                        if not node:
                            node = Node(name=altname, parent=parent,
                                        nodeID=self.newID(), depth=depth)
                            self.addNode(node)
                    else:
                        depth = node.getHighestRank()
                else:
                    # Fix depth issues
                    rank = depth

                    if (taxon == "Plastid" or
                        taxon == "Nucleus" or
                        taxon == "Mitochondrion"):
                        rank = Tree.NORANK
                        depth = Tree.DOMAIN

                    #Find depth
                    elif taxon is taxa[-1] and depth < Tree.SPECIES:
                        if not species:
                            rank = Tree.SPECIES
                        else:
                            rank = Tree.SUBSPECIES
                    elif (taxon is taxa[-2] and
                          species and
                          depth < Tree.SPECIES):
                        rank = Tree.SPECIES
                    elif taxon is taxa[0]:
                        depth = rank = Tree.DOMAIN
                    elif GGfix:
                        if dsym == "k":
                            rank = Tree.DOMAIN
                        elif dsym == "p":
                            rank = Tree.PHYLUM
                        elif dsym == "c":
                            rank = Tree.CLASS
                        elif dsym == "o":
                            rank = Tree.ORDER
                        elif dsym == "f":
                            rank = Tree.FAMILY
                        elif dsym == "g":
                            rank = Tree.GENUS
                        elif dsym =="s":
                            rank = Tree.SPECIES
                    elif taxon in self.phylumNames:
                        depth = rank = Tree.PHYLUM
                    elif taxon in self.classNames:
                        depth = rank = Tree.CLASS
                    elif taxon in self.orderNames or taxon[-4:] == "ales":
                        depth = rank = Tree.ORDER
                    elif taxon in self.familyNames or taxon[-4:] == "ceae":
                        depth = rank = Tree.FAMILY
                    elif taxon in self.genusNames:
                        depth = rank = Tree.GENUS

                    p = parent
                    if not GGMode:
                        while (rank > Tree.META and
                               p.depth and
                               p.depth >= rank):
                            p.depth = Tree.NORANK
                            if p.parent:
                                p = p.parent
                                

                    #Add node
                    node = Node(name=taxon, parent=parent,
                                nodeID=self.newID(), depth=rank)
                    self.addNode(node)

                parent = node
                parents.append(node.name)

                if taxon is taxa[-1]:
                    #Associate accession with taxa
                    node.assignRead(Read(accession))
                i += 1
Beispiel #7
0
class LCAClassifier(Tree):
    """A classifier instance inherits from the standard Tree and assigns
    reads to it representing the sequence reads of the classified dataset"""

    def __init__(self, name, minFilter=True, fastafile=None, qualfile=None, otus=None):
        """Create an empty classifier tree and optionally preload read data.

        name      -- name handed on to Tree.__init__
        minFilter -- stored as self.minFilter; enables the similarity
                     filter in assign()
        fastafile -- optional FASTA path; sequences go to self.seqs by id
        qualfile  -- optional QUAL path; records go to self.qual by id
        otus      -- accepted but not used yet (see TODO below)
        """
        Tree.__init__(self, Node(name="root", nodeID=1), name=name)
        self.bsr = BSR_DEFAULT
        self.ms = MS_DEFAULT
        self.minFilter = minFilter
        # Node collecting the reads that cannot be classified.
        self.noHits = Node("No hits", parent=self.root)
        self.addNode(self.noHits)
        self.seqs = {}
        self.qual = {}
        # Read / OTU name -> node it was assigned to.
        self.read_node_assignments = {}

        # Context managers close the input files again, also on errors.
        if fastafile:
            with open(fastafile, 'r') as fstream:
                for seq_record in SeqIO.parse(fstream, "fasta"):
                    self.seqs[seq_record.id] = seq_record.seq

        if qualfile:
            with open(qualfile, 'r') as qstream:
                for q_record in SeqIO.parse(qstream, "qual"):
                    self.qual[q_record.id] = q_record

        #TODO Process tab- or comma-separated otu-file

    def assign(self, records, datasets=None, abundances=None, verbose=False,
               euk_filter=False):
        """Accepts an iterator of Biopython BLAST records and carries out
        LCA assignments to the given datasets.

        records    -- iterable of BLAST records (uses .query and .alignments)
        datasets   -- optional list of dataset names; without it reads are
                      assigned with dataset=None
        abundances -- optional mapping of read / OTU name to a list of
                      abundances in the same order as the datasets list
        verbose    -- print information about highly similar hits and
                      reads pushed up by the similarity filter
        euk_filter -- send reads whose lineage contains the "Eukaryota"
                      node to the "No hits" node

        Without abundances, the read population is taken from the query
        name (suffix after the last "_", "numreads=" or "size=" annotation)
        and defaults to 1.  Reads for which no abundance can be determined
        are skipped.
        """

        for record in records:
            qName = record.query.split(" ")[0]

            read_abundances = {}
            # Population used when no datasets are given; None = unknown.
            ra = None

            #Determine read population from otus if given
            if abundances:
                try:
                    if qName in abundances:
                        seq_abundances = abundances[qName]
                    else:
                        qFix = qName[:qName.find("_")]
                        seq_abundances = abundances[qFix]

                    for i, ds in enumerate(datasets):
                        read_abundances[ds] = seq_abundances[i]
                except (KeyError, IndexError, TypeError):
                    print("Warning: Cannot find %s in OTU table!" % qName)

            #Else determine read population from its name / annotation.
            else:
                if "_" in qName:
                    try:
                        readPopulation = int(
                            qName.split("_")[-1].replace(".00", ""))
                    except ValueError:
                        readPopulation = 1
                elif "numreads=" in qName:
                    readPopulation = int(qName[qName.find("numreads=") +
                                               len("numreads="):])

                elif "size=" in qName:
                    # Strip the optional trailing ";" only, so that a label
                    # without it does not lose its last digit.
                    readPopulation = int(qName[qName.find("size=") +
                                               len("size="):].rstrip(";"))
                #Or set to 1, if we cannot find
                else:
                    readPopulation = 1

                if datasets:
                    for ds in datasets:
                        read_abundances[ds] = readPopulation
                else:
                    ra = readPopulation

            # An abundance is known either per dataset or, without
            # datasets, as a single population.  Testing read_abundances
            # alone would silently skip every read when no datasets are
            # given.
            hasAbundance = bool(read_abundances) or ra is not None

            # Check for minimum score and any alignments
            if (record.alignments and hasAbundance and
                    record.alignments[0].hsps[0].bits >= self.ms):

                best_hsp = record.alignments[0].hsps[0]
                topScore = best_hsp.bits
                if qName in self.seqs:
                    qSeq = self.seqs[qName]
                else:
                    qSeq = str(best_hsp.query).replace("-", "")
                hitname = record.alignments[0].hit_def.split()[0]
                node = self.getNode(hitname)
                if not node:
                    sys.stderr.write("Best-scoring node %s not found!\n" %
                                     hitname)
                    sys.stderr.write("Cannot assign read %s\n" % qName)
                else:
                    # Lineage of the best hit, excluding the hit itself.
                    parents = node.getPhylogeny()[1:]

                    # Iterate through rest of hits until falling below
                    # threshold
                    for a in record.alignments[1:]:
                        if a.hsps[0].bits < float(topScore) * self.bsr:
                            break
                        hitname = a.hit_def.split()[0]
                        n = self.getNode(hitname)

                        if not n:
                            sys.stderr.write("Node " + hitname +
                                             " not found! Ignoring.\n")
                        else:
                            p = n.parent
                            # iterate through parents until found in the
                            # parents list
                            while p not in parents:
                                p = p.parent
                            parents = parents[parents.index(p):]

                    # Take a look at similarity, print info if verbose and
                    # kick up if filter
                    hsp_sim = (float(best_hsp.identities) /
                               float(best_hsp.align_length))
                    if verbose and hsp_sim >= .99:
                        print("Read %s is %s percent similar to %s" %
                              (qName, hsp_sim * 100,
                               record.alignments[0].hit_def))

                    if self.minFilter:
                        maxRankLimit = Tree.SPECIES
                        maxRank = maxRankLimit
                        d = maxRankLimit
                        # Walk the rank limits from the highest rank key
                        # downwards.
                        for rank in sorted(sfLimits.keys(), reverse=True):
                            if hsp_sim < sfLimits[rank]:
                                maxRank = rank - 1
                            else:
                                break

                        while (maxRank < Tree.SPECIES and
                               maxRank < parents[0].getHighestRank()):
                            d = min(parents[0].getHighestRank(), maxRankLimit)
                            if verbose:
                                print("Read %s cannot be assigned to "
                                      "rank %s (similarity=%s)" %
                                      (qName, Tree.depths[d],
                                       hsp_sim))
                            parents = parents[1:]

                        if d < maxRankLimit:
                            # The read was pushed up: collect it in an
                            # "Unknown ..." child of the remaining parent.
                            novelName = ("Unknown %s %s" %
                                         (parents[0].name, Tree.depths[d]))
                            nn = self.getNode(novelName)
                            if nn:
                                novelNode = nn
                            else:
                                depth = parents[0].getHighestRank() + 1
                                novelNode = Node(novelName, parent=parents[0],
                                                 depth=depth)
                                self.addNode(novelNode)
                            parents = [novelNode] + parents

                    # Handle assignment
                    read = Read(qName, seq=qSeq)

                    if euk_filter and self.getNode("Eukaryota") in parents:
                        parents = [self.noHits]

                    if datasets:
                        for ds in datasets:
                            dsAbundance = read_abundances[ds]
                            if dsAbundance > 0:
                                parents[0].assignRead(read, dataset=ds,
                                                      abundance=dsAbundance,
                                                      primary=True,
                                                      recursive=True)
                    else:
                        parents[0].assignRead(read, dataset=None,
                                              abundance=ra,
                                              primary=True, recursive=True)
                    self.read_node_assignments[qName] = parents[0]

            #Below min. score
            elif hasAbundance:
                #No hits
                if qName in self.seqs:
                    qSeq = self.seqs[qName]
                elif record.alignments:
                    qSeq = str(record.alignments[0].hsps[0].query).replace(
                        "-", "")
                else:
                    qSeq = None
                nhr = Read(name=qName, seq=qSeq)
                if datasets:
                    for ds in datasets:
                        dsAbundance = read_abundances[ds]
                        if dsAbundance > 0:
                            self.noHits.assignRead(nhr, dataset=ds,
                                                   abundance=dsAbundance,
                                                   primary=True)
                            self.root.assignRead(nhr, dataset=ds,
                                                 abundance=dsAbundance,
                                                 primary=False)
                else:
                    # No datasets: there is no dataset name to pass on.
                    self.noHits.assignRead(nhr, dataset=None, abundance=ra,
                                           primary=True)
                    self.root.assignRead(nhr, dataset=None, abundance=ra,
                                         primary=False)

                self.read_node_assignments[qName] = self.noHits

    def setBitscoreRange(self, percent):
        """Set the bitscore ratio from a percentage below the top score."""
        self.bsr = 1 - float(percent) / 100

    def setMinScore(self, minScore):
        """Set the minimum bitscore the best hit needs for assignment."""
        self.ms = minScore

    def printAssignmentsRDPQual(self, node, dataset=None, printFile=None,
                                 newTabStyle=False):
        """Write the quality values of all primary reads below node.

        For every primary read of node (and, recursively, of its children)
        a header line with read name and RDP-style lineage is written,
        followed by the read's quality values joined on one line.  Output
        goes to printFile if given, otherwise to stdout.
        """
        assignments = node.getAssignment(dataset)
        if assignments and assignments.primReads:

            for r in assignments.primReads:
                toPrint = (">%s\t%s\n" %
                           (r.name,
                            node.getPhylogenyRDPStyle(
                                      root=False, newTabStyle=newTabStyle)))
                # Skip the record's own header line and join the rest.
                qualLines = self.qual[str(r.name)].format("qual").split("\n")[1:]
                toPrint += "".join(line + " " for line in qualLines)
                if printFile:
                    printFile.write(toPrint + "\n")
                else:
                    print(toPrint)
        if assignments:
            for child in node.children:
                self.printAssignmentsRDPQual(node=child, dataset=dataset,
                                              printFile=printFile,
                                              newTabStyle=newTabStyle)

    def writeBIOMTable(self, bTable, biomOut):
        """Add the assigned taxonomy to bTable and write it to biomOut.

        Observations without an assignment get the lineage of the
        "No hits" node and are reported on stderr.
        """
        taxonomy_dict = {}
        for obs in bTable.ObservationIds:
            try:
                aNode = self.read_node_assignments[obs]
            except KeyError:
                # write() takes a single string, so format the message.
                sys.stderr.write("Problem with assignment of %s: "
                                 "classification not found!\n" % obs)
                aNode = self.noHits
            taxonomy_dict[obs] = {"taxonomy": aNode.getPhylogenyNameList()}
        bTable.addObservationMetadata(taxonomy_dict)
        biomOut.write(bTable.getBiomFormatJsonString("CREST"))

    def printAllSequences(self, fastaOut):
        """Write every assigned sequence to fastaOut in FASTA format, with
        its RDP-style lineage in the header line."""
        for seqName, node in self.read_node_assignments.items():
            try:
                sn = self.seqs[seqName]
            except KeyError:
                # Fall back to the name without its "_..." suffix.
                sn = self.seqs[seqName[:seqName.find("_")]]

            fastaOut.write(">%s %s\n%s\n" % (seqName,
                                           node.getPhylogenyRDPStyle(root=False),
                                           sn))

    def writeOTUsWithAssignments(self, otusOut, abundances, datasets, sep="\t"):
        """Write the OTU table with one extra "classification" column.

        otusOut    -- writable file object
        abundances -- mapping of OTU name to its list of abundances
        datasets   -- dataset names used as column headers
        sep        -- column separator
        """
        otusOut.write("OTU" + sep)
        for ds in datasets:
            otusOut.write(ds + sep)
        otusOut.write("classification\n")
        for seqName in abundances.keys():
            otusOut.write(seqName)
            for ab in abundances[seqName]:
                otusOut.write("%s%i" % (sep, ab))
            taxonomy = self.read_node_assignments[seqName].getPhylogenyRDPStyle(root=False)
            otusOut.write("%s%s\n" % (sep, taxonomy))
Beispiel #8
0
    def assign(self,
               records,
               datasets=None,
               abundances=None,
               verbose=False,
               euk_filter=False):
        """Accepts a of biopython blast iterator and carries out LCA
        assignments to a given dataset. If abundances given, it must have same order
        for all lists as the datasets list"""

        for record in records:
            qName = record.query.split(" ")[0]

            read_abundances = {}

            #Determine read population from from otus if given
            if abundances:
                try:
                    i = 0
                    if qName in abundances.keys():
                        seq_abundances = abundances[qName]
                    else:
                        qFix = qName[:qName.find("_")]
                        seq_abundances = abundances[qFix]

                    for ds in datasets:
                        read_abundances[ds] = seq_abundances[i]
                        i += 1
                except:
                    print "Warning: Cannot find %s in OTU table!" % qName

            #Else determine read population from its name / annotation.
            else:
                if "_" in qName:
                    try:
                        readPopulation = int(
                            qName.split("_")[-1].replace(".00", ""))
                    except:
                        readPopulation = 1
                elif "numreads=" in qName:
                    readPopulation = int(qName[qName.find("numreads=") +
                                               len("numreads="):])

                elif "size=" in qName:
                    readPopulation = int(qName[qName.find("size=") +
                                               len("size="):-1])
                #Or set to 1, if we cannot find
                else:
                    readPopulation = 1

                if datasets:
                    for ds in datasets:
                        read_abundances[ds] = readPopulation
                else:
                    ra = readPopulation

            # Check for minimum score and any alignemnts
            if (record.alignments and read_abundances
                    and record.alignments[0].hsps[0].bits >= self.ms):

                best_hsp = record.alignments[0].hsps[0]
                topScore = best_hsp.bits
                if self.seqs and qName in self.seqs.keys():
                    qSeq = self.seqs[qName]
                else:
                    qSeq = str(best_hsp.query).replace("-", "")
                hitname = record.alignments[0].hit_def.split()[0]
                node = self.getNode(hitname)
                if not node:
                    sys.stderr.write("Best-scoring node %s not found!\n" %
                                     hitname)
                    sys.stderr.write("Cannot assign read %s\n" % qName)
                else:
                    parents = node.getPhylogeny()[1:]

                    # Iterate through rest of hits until falling below treshold
                    for a in record.alignments[1:]:
                        if a.hsps[0].bits < float(topScore) * self.bsr:
                            break
                        hitname = a.hit_def.split()[0]
                        n = self.getNode(hitname)

                        if not n:
                            sys.stderr.write("Node " + hitname +
                                             " not found! Ignoring.\n")
                        else:
                            p = n.parent
                            # iterate through parents until found in the
                            # parents list
                            while p not in parents:
                                p = p.parent
                            parents = parents[parents.index(p):]

                    # Take a look at similarity, print info if verbose and
                    # kick up if filter
                    hsp_sim = (float(best_hsp.identities) /
                               float(best_hsp.align_length))
                    if verbose and hsp_sim >= .99:
                        print("Read %s is %s percent similar to %s" %
                              (qName, hsp_sim * 100,
                               record.alignments[0].hit_def))

                    if self.minFilter:
                        maxRankLimit = Tree.SPECIES
                        maxRank = maxRankLimit
                        d = maxRankLimit
                        ranks = sfLimits.keys()
                        ranks.sort()
                        ranks.reverse()
                        for rank in ranks:
                            if hsp_sim < sfLimits[rank]:
                                maxRank = rank - 1
                            else:
                                break

                        while (maxRank < Tree.SPECIES
                               and maxRank < parents[0].getHighestRank()):
                            d = min(parents[0].getHighestRank(), maxRankLimit)
                            if verbose:
                                print(
                                    "Read %s cannot be assigned to "
                                    "rank %s (similarity=%s)" %
                                    (qName, Tree.depths[d], hsp_sim))
                            parents = parents[1:]

                        if d < maxRankLimit:
                            novelName = ("Unknown %s %s" %
                                         (parents[0].name, Tree.depths[d]))
                            nn = self.getNode(novelName)
                            if nn:
                                novelNode = nn
                            else:
                                depth = parents[0].getHighestRank() + 1
                                novelNode = Node(novelName,
                                                 parent=parents[0],
                                                 depth=depth)
                                self.addNode(novelNode)
                            parents = [novelNode] + parents

                    # Handle assignment
                    read = Read(qName, seq=qSeq)

                    if euk_filter and self.getNode("Eukaryota") in parents:
                        parents = [self.noHits]

                    if datasets:
                        for ds in datasets:
                            ra = read_abundances[ds]
                            if ra > 0:
                                parents[0].assignRead(read,
                                                      dataset=ds,
                                                      abundance=ra,
                                                      primary=True,
                                                      recursive=True)
                    else:
                        parents[0].assignRead(read,
                                              dataset=None,
                                              abundance=ra,
                                              primary=True,
                                              recursive=True)
                    self.read_node_assignments[qName] = parents[0]

            #Below min. score
            elif read_abundances:
                #No hits
                if self.seqs and qName in self.seqs.keys():
                    qSeq = self.seqs[qName]
                elif record.alignments:
                    qSeq = record.alignments[0].hsps[0].query.replace("-", "")
                else:
                    qSeq = None
                nhr = Read(name=qName, seq=qSeq)
                if datasets:
                    for ds in datasets:
                        ra = read_abundances[ds]
                        if ra > 0:
                            self.noHits.assignRead(nhr,
                                                   dataset=ds,
                                                   abundance=ra,
                                                   primary=True)
                            self.root.assignRead(nhr,
                                                 dataset=ds,
                                                 abundance=ra,
                                                 primary=False)
                else:
                    self.noHits.assignRead(nhr,
                                           dataset=ds,
                                           abundance=ra,
                                           primary=True)

                    self.root.assignRead(nhr,
                                         dataset=ds,
                                         abundance=ra,
                                         primary=False)

                self.read_node_assignments[qName] = self.noHits
Beispiel #9
0
class LCAClassifier(Tree):
    """A classifier instance inherits from the standard Tree and assigns
    reads to it representing the sequence reads of the classified dataset"""
    def __init__(self,
                 name,
                 minFilter=True,
                 fastafile=None,
                 qualfile=None,
                 otus=None):
        """Create an empty classification tree and load optional read data.

        name      -- name of the tree, passed on to Tree.__init__.
        minFilter -- stored on the instance; assign() checks it to decide
                     whether the similarity-based rank filter is applied.
        fastafile -- optional path to a FASTA file. The sequence of every
                     record is stored in self.seqs, keyed by record id.
        qualfile  -- optional path to a file in "qual" format. Every record
                     is stored in self.qual, keyed by record id.
        otus      -- accepted for interface compatibility but not used yet
                     (see the TODO at the end of this method).
        """
        Tree.__init__(self, Node(name="root", nodeID=1), name=name)
        self.bsr = BSR_DEFAULT
        self.ms = MS_DEFAULT
        self.minFilter = minFilter
        self.noHits = Node("No hits", parent=self.root)
        self.addNode(self.noHits)
        self.seqs = {}
        self.qual = {}
        # Filled by assign(): read name -> node the read was assigned to.
        self.read_node_assignments = {}

        # The input files are read inside context managers so that the
        # handles are closed again, also when parsing fails half-way.
        if fastafile:
            with open(fastafile, 'r') as fstream:
                for seq_record in SeqIO.parse(fstream, "fasta"):
                    self.seqs[seq_record.id] = seq_record.seq

        if qualfile:
            with open(qualfile, 'r') as qstream:
                for q_record in SeqIO.parse(qstream, "qual"):
                    self.qual[q_record.id] = q_record

        #TODO Process tab- or comma-separated otu-file

    def assign(self,
               records,
               datasets=None,
               abundances=None,
               verbose=False,
               euk_filter=False):
        """Assign every read of a BLAST result iterator to a tree node.

        For each record the best hit is looked up in the tree. Further hits
        are scanned in order until one scores below ``self.bsr`` times the
        top bit score; the read is placed on the lowest ancestor shared by
        all hits seen up to that point. If ``self.minFilter`` is set, the
        placement is moved further towards the root while the similarity of
        the best HSP is below the limits in ``sfLimits``, and an
        "Unknown ..." node is used for the read. Reads without alignments,
        or whose best hit scores below ``self.ms``, go to the "No hits"
        node.

        records    -- iterable of BLAST records (Biopython style, providing
                      ``query`` and ``alignments`` with ``hsps``).
        datasets   -- optional list of dataset names. Without it, reads are
                      assigned once with ``dataset=None``.
        abundances -- optional mapping from read / OTU name to a list of
                      abundances in the same order as ``datasets``. Reads
                      that cannot be found in it are reported and skipped.
                      Without it the abundance is parsed from the read name.
        verbose    -- print details about highly similar and filtered reads.
        euk_filter -- if true, reads whose lineage contains the node named
                      "Eukaryota" are assigned to "No hits" instead.

        The node chosen for each read is recorded in
        ``self.read_node_assignments`` under the read name.
        """

        for record in records:
            qName = record.query.split(" ")[0]

            read_abundances = {}
            # Abundance used when no datasets are given. None means that no
            # abundance could be determined for this read.
            ra = None

            # Determine read population from the OTU table if given
            if abundances:
                try:
                    if qName in abundances:
                        seq_abundances = abundances[qName]
                    else:
                        # Retry with the part of the name before the first
                        # underscore.
                        qFix = qName[:qName.find("_")]
                        seq_abundances = abundances[qFix]

                    for i, ds in enumerate(datasets):
                        read_abundances[ds] = seq_abundances[i]
                except (KeyError, IndexError, TypeError):
                    # KeyError: read not in the table; IndexError: fewer
                    # abundances than datasets; TypeError: no datasets given.
                    print("Warning: Cannot find %s in OTU table!" % qName)

            # Else determine read population from its name / annotation.
            else:
                readPopulation = self._readPopulationFromName(qName)
                if datasets:
                    for ds in datasets:
                        read_abundances[ds] = readPopulation
                else:
                    ra = readPopulation

            # A read is only processed when an abundance is known for it.
            # Testing read_abundances alone would skip every read when no
            # datasets are given, because it stays empty in that case.
            has_abundance = bool(read_abundances) or ra is not None

            # Check for minimum score and any alignments
            if (record.alignments and has_abundance
                    and record.alignments[0].hsps[0].bits >= self.ms):

                best_hsp = record.alignments[0].hsps[0]
                topScore = best_hsp.bits
                if self.seqs and qName in self.seqs:
                    qSeq = self.seqs[qName]
                else:
                    # No FASTA sequence stored: use the aligned query part
                    # with the gap characters removed.
                    qSeq = str(best_hsp.query).replace("-", "")
                hitname = record.alignments[0].hit_def.split()[0]
                node = self.getNode(hitname)
                if not node:
                    sys.stderr.write("Best-scoring node %s not found!\n" %
                                     hitname)
                    sys.stderr.write("Cannot assign read %s\n" % qName)
                else:
                    parents = node.getPhylogeny()[1:]

                    # Iterate through rest of hits until falling below
                    # threshold
                    for a in record.alignments[1:]:
                        if a.hsps[0].bits < float(topScore) * self.bsr:
                            break
                        hitname = a.hit_def.split()[0]
                        n = self.getNode(hitname)

                        if not n:
                            sys.stderr.write("Node " + hitname +
                                             " not found! Ignoring.\n")
                        else:
                            p = n.parent
                            # iterate through parents until found in the
                            # parents list
                            while p not in parents:
                                p = p.parent
                            parents = parents[parents.index(p):]

                    # Take a look at similarity, print info if verbose and
                    # kick up if filter
                    hsp_sim = (float(best_hsp.identities) /
                               float(best_hsp.align_length))
                    if verbose and hsp_sim >= .99:
                        print("Read %s is %s percent similar to %s" %
                              (qName, hsp_sim * 100,
                               record.alignments[0].hit_def))

                    if self.minFilter:
                        maxRankLimit = Tree.SPECIES
                        maxRank = maxRankLimit
                        d = maxRankLimit
                        # Walk the ranks from the highest key downwards and
                        # lower maxRank for every limit the similarity
                        # fails to reach.
                        for rank in sorted(sfLimits, reverse=True):
                            if hsp_sim < sfLimits[rank]:
                                maxRank = rank - 1
                            else:
                                break

                        while (maxRank < Tree.SPECIES
                               and maxRank < parents[0].getHighestRank()):
                            d = min(parents[0].getHighestRank(), maxRankLimit)
                            if verbose:
                                print(
                                    "Read %s cannot be assigned to "
                                    "rank %s (similarity=%s)" %
                                    (qName, Tree.depths[d], hsp_sim))
                            parents = parents[1:]

                        if d < maxRankLimit:
                            # The read was moved up: place it on an
                            # "Unknown <parent> <rank>" node, which is
                            # created on first use.
                            novelName = ("Unknown %s %s" %
                                         (parents[0].name, Tree.depths[d]))
                            nn = self.getNode(novelName)
                            if nn:
                                novelNode = nn
                            else:
                                depth = parents[0].getHighestRank() + 1
                                novelNode = Node(novelName,
                                                 parent=parents[0],
                                                 depth=depth)
                                self.addNode(novelNode)
                            parents = [novelNode] + parents

                    # Handle assignment
                    read = Read(qName, seq=qSeq)

                    if euk_filter and self.getNode("Eukaryota") in parents:
                        parents = [self.noHits]

                    if datasets:
                        for ds in datasets:
                            ds_abundance = read_abundances[ds]
                            if ds_abundance > 0:
                                parents[0].assignRead(read,
                                                      dataset=ds,
                                                      abundance=ds_abundance,
                                                      primary=True,
                                                      recursive=True)
                    else:
                        parents[0].assignRead(read,
                                              dataset=None,
                                              abundance=ra,
                                              primary=True,
                                              recursive=True)
                    self.read_node_assignments[qName] = parents[0]

            # Below min. score or no hits at all
            elif has_abundance:
                if self.seqs and qName in self.seqs:
                    qSeq = self.seqs[qName]
                elif record.alignments:
                    qSeq = str(record.alignments[0].hsps[0].query).replace(
                        "-", "")
                else:
                    qSeq = None
                nhr = Read(name=qName, seq=qSeq)
                if datasets:
                    for ds in datasets:
                        ds_abundance = read_abundances[ds]
                        if ds_abundance > 0:
                            self.noHits.assignRead(nhr,
                                                   dataset=ds,
                                                   abundance=ds_abundance,
                                                   primary=True)
                            self.root.assignRead(nhr,
                                                 dataset=ds,
                                                 abundance=ds_abundance,
                                                 primary=False)
                else:
                    # No datasets: assign under dataset None, as in the
                    # branch for reads with hits.
                    self.noHits.assignRead(nhr,
                                           dataset=None,
                                           abundance=ra,
                                           primary=True)

                    self.root.assignRead(nhr,
                                         dataset=None,
                                         abundance=ra,
                                         primary=False)

                self.read_node_assignments[qName] = self.noHits

    def _readPopulationFromName(self, qName):
        """Return the read count encoded in a read name, or 1 if none.

        Three name annotations are recognised, tested in this order: a
        numeric suffix after the last underscore (a ".00" inside it is
        removed before conversion), "numreads=<n>" at the end of the name,
        and "size=<n>" followed by one trailing character.
        """
        if "_" in qName:
            try:
                return int(qName.split("_")[-1].replace(".00", ""))
            except ValueError:
                # Suffix is not a number: treat as a single read.
                return 1
        if "numreads=" in qName:
            return int(qName[qName.find("numreads=") + len("numreads="):])
        if "size=" in qName:
            # The last character of the name is dropped before conversion
            # (presumably a terminating ";" as in "size=12;").
            return int(qName[qName.find("size=") + len("size="):-1])
        # Or set to 1, if we cannot find any annotation
        return 1

    def setBitscoreRange(self, percent):
        """Set the bit-score ratio used by assign() from a percentage.

        percent -- how far (in percent of the top bit score) a hit may
                   score below the best hit; stored in self.bsr as the
                   fraction 1 - percent / 100.
        """
        fraction = float(percent) / 100
        self.bsr = 1 - fraction

    def setMinScore(self, minScore):
        """Store the minimum bit score that assign() requires of a best hit.

        minScore -- value kept in self.ms without conversion.
        """
        self.ms = minScore

    def printAssignmentsRDPQual(self,
                                node,
                                dataset=None,
                                printFile=None,
                                newTabStyle=False):
        """Write the quality records of all reads assigned below a node.

        For every primary read of ``node`` one entry is produced: a header
        line ">name<TAB>taxonomy" followed by the quality values of the
        read on a single line. The children of the node are then processed
        recursively. A node without assignment for ``dataset`` ends the
        recursion for its whole subtree.

        node        -- node to start from.
        dataset     -- dataset whose assignments are looked up on the node.
        printFile   -- object with a write() method; if omitted the entries
                       are printed to standard output.
        newTabStyle -- passed on to node.getPhylogenyRDPStyle().

        Raises KeyError if a read has no record in self.qual.
        """
        assignments = node.getAssignment(dataset)
        if not assignments:
            return

        if assignments.primReads:
            # The taxonomy string is the same for every read of this node.
            taxonomy = node.getPhylogenyRDPStyle(root=False,
                                                 newTabStyle=newTabStyle)
            for r in assignments.primReads:
                # Skip the header line of the "qual" record and put the
                # remaining lines on one line, each followed by a space.
                qualLines = self.qual[str(
                    r.name)].format("qual").split("\n")[1:]
                toPrint = (">%s\t%s\n" % (r.name, taxonomy) +
                           "".join(line + " " for line in qualLines))
                if printFile:
                    printFile.write(toPrint + "\n")
                else:
                    print(toPrint)

        for child in node.children:
            self.printAssignmentsRDPQual(node=child,
                                         dataset=dataset,
                                         printFile=printFile,
                                         newTabStyle=newTabStyle)

    def writeBIOMTable(self, bTable, biomOut):
        """Add the taxonomy of every observation to a BIOM table and write it.

        bTable  -- table object providing ObservationIds,
                   addObservationMetadata() and getBiomFormatJsonString().
        biomOut -- object with a write() method receiving the JSON string.

        Observations without an entry in self.read_node_assignments are
        reported on standard error and given the taxonomy of the "No hits"
        node.
        """
        taxonomy_dict = {}
        for obs in bTable.ObservationIds:
            try:
                aNode = self.read_node_assignments[obs]
            except KeyError:
                # write() takes a single string, so the message is
                # formatted before it is passed on.
                sys.stderr.write("Problem with assignment of %s: "
                                 "classification not found!\n" % obs)
                aNode = self.noHits
            taxonomy_dict[obs] = {"taxonomy": aNode.getPhylogenyNameList()}
        bTable.addObservationMetadata(taxonomy_dict)
        biomOut.write(bTable.getBiomFormatJsonString("CREST"))

    def printAllSequences(self, fastaOut):
        """Write every assigned read as a FASTA entry with its taxonomy.

        fastaOut -- object with a write() method. For each entry of
                    self.read_node_assignments it receives
                    ">name taxonomy", a newline, the sequence and a newline.

        The sequence is looked up in self.seqs by the full read name and,
        failing that, by the part of the name before the first underscore.
        Raises KeyError if neither key is present.
        """
        for seqName, node in self.read_node_assignments.items():
            try:
                sn = self.seqs[seqName]
            except KeyError:
                # Sequences may be stored under the name without its
                # "_<suffix>" annotation.
                sn = self.seqs[seqName.split("_", 1)[0]]

            fastaOut.write(
                ">%s %s\n%s\n" %
                (seqName, node.getPhylogenyRDPStyle(root=False), sn))

    def writeOTUsWithAssignments(self,
                                 otusOut,
                                 abundances,
                                 datasets,
                                 sep="\t"):
        """Write an OTU table with one classification column.

        otusOut    -- object with a write() method receiving the table.
        abundances -- mapping from sequence name to an iterable of
                      abundances; each value is formatted with "%i".
        datasets   -- dataset names used as column headers.
        sep        -- column separator (a tab by default).

        The header row is "OTU", the dataset names and "classification".
        Each following row holds a sequence name, its abundances and the
        taxonomy of the node stored for that name in
        self.read_node_assignments. A sequence name without an entry there
        raises KeyError.
        """
        otusOut.write("OTU" + sep)
        for ds in datasets:
            otusOut.write(ds + sep)
        otusOut.write("classification\n")
        for seqName in abundances.keys():
            otusOut.write(seqName)
            for ab in abundances[seqName]:
                otusOut.write("%s%i" % (sep, ab))
            taxonomy = self.read_node_assignments[
                seqName].getPhylogenyRDPStyle(root=False)
            otusOut.write("%s%s\n" % (sep, taxonomy))
        # NOTE(review): everything below reads sys.argv and works on a
        # separate RDPQIIMETree instance; it uses none of this method's
        # parameters. It looks like the command-line driver of another
        # script (the usage text names qiimeAssignmentParse.py) that ended
        # up inside this method -- confirm and move it out.
        if len(sys.argv) < 3:
            print ("Use qiimeAssignmentParse.py rdp_out_file confidence "
                   "(0-1.0) ['tree'] ")
        else:
            tree = False
            mt = RDPQIIMETree(name=sys.argv[1])
            # Any argument from the fourth on that equals 'tree' selects
            # tree output.
            for i in range(3, len(sys.argv)):
                if sys.argv[i] == 'tree':
                    tree = True
            mt.parseQIIME(sys.argv[1], confidence=(sys.argv[2]))  # (not tree))
            mt.pruneUnassigned()

        # NOTE(review): `tree` and `mt` are only bound in the else branch
        # above, so this statement raises NameError when fewer than three
        # command-line arguments are present.
        if tree:
            # Copy the list of children first, because the nodes are moved
            # below a new parent afterwards.
            dn = []
            for n in mt.root.children:
                dn.append(n)

            co = Node(parent=mt.root, name="Cellular organisms", population=0,
                                        reads=[], singles=0, doubles=0)

            for d in dn:
                mt.moveNode(d, co)

            mt.printAsTree()
        else:
            # RDPTree is not defined in the visible part of the file;
            # presumably its `depths` maps depth level to rank name --
            # TODO confirm.
            for level in RDPTree.depths.keys():
                print("Assingments at %s level" % RDPTree.depths[level])
                print
                mt.printPopulationsAtDepth(level, normaliseToBase=False)
                print