Example #1
0
    def testBinMarkerSets(self):
        """Check that BinMarkerSets aggregates the marker sets added to it.

        Two marker sets are added in order. The bin must report the union of
        their marker genes, and must return the first set added as both the
        most specific and the selected marker set.
        """
        binSets = BinMarkerSets(0, BinMarkerSets.TAXONOMIC_MARKER_SET)

        firstSet = MarkerSet(1, 'k__Bacteria', 100, [{'a', 'b'}, {'c'}])
        secondSet = MarkerSet(2, 'k__Bacteria', 100, [{'d', 'e'}, {'f'}])
        for candidate in (firstSet, secondSet):
            binSets.addMarkerSet(candidate)

        # every gene from both marker sets should be present exactly once
        expectedGenes = {'a', 'b', 'c', 'd', 'e', 'f'}
        self.assertEqual(binSets.getMarkerGenes(), expectedGenes)

        # the first set added is expected to win both selections
        self.assertEqual(binSets.mostSpecificMarkerSet(), firstSet)
        self.assertEqual(binSets.selectedMarkerSet(), firstSet)
Example #2
0
    def buildDomainMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = None):
        """Build domain-specific marker sets for a genome in a LOO-fashion.

        Walks from curNode up to the root of the tree. For every node on that
        path whose unique id (the label text before the first '|') is 'UID2'
        or 'UID203', a marker set is built from the genomes under that node
        and recorded, together with a refined copy of it.

        Arguments:
          tree                - not used by this method
          curNode             - node at which the upward walk starts; its label
                                also names the returned BinMarkerSets objects
                                and selects the lineage-specific refinement
          ubiquityThreshold   - passed through to the marker set builders
          singleCopyThreshold - passed through to the marker set builders
          bMarkerSet          - if True use self.buildMarkerSet(); otherwise wrap
                                the result of self.buildMarkerGenes() in a
                                single-set MarkerSet
          genomeIdsToRemove   - optional collection of genome ids to exclude
                                before building each marker set

        Returns:
          (binMarkerSets, refinedBinMarkerSet) - the unrefined and the refined
          marker sets collected during the walk.
        """

        # determine marker sets for bin
        binMarkerSets = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)
        refinedBinMarkerSet = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)

        # the refinement is fixed by the starting node and reused for every
        # marker set recorded during the walk
        uniqueId = curNode.label.split('|')[0]
        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

        # calculate marker set for bacterial or archaeal node
        while curNode is not None:
            uniqueId = curNode.label.split('|')[0]
            if uniqueId in ('UID2', 'UID203'):
                stats = self.uniqueIdToLineageStatistics[uniqueId]
                taxonomyStr = stats['taxonomy']
                if taxonomyStr == '':
                    taxonomyStr = self.__getNextNamedNode(curNode, self.uniqueIdToLineageStatistics)

                # gather ids of all genomes under this node, including any
                # genomes recorded as duplicates of a leaf
                genomeIds = set()
                for leaf in curNode.leaf_nodes():
                    genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

                    for dup in self.duplicateSeqs.get(leaf.taxon.label, []):
                        genomeIds.add(dup.replace('IMG_', ''))

                # remove all genomes from the same taxonomic group as the genome of interest
                if genomeIdsToRemove is not None:
                    genomeIds.difference_update(genomeIdsToRemove)

                # nodes left with fewer than two genomes are skipped
                if len(genomeIds) >= 2:
                    if bMarkerSet:
                        markerSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
                    else:
                        markerSet = MarkerSet(0, 'NA', len(genomeIds), [self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)])

                    markerSet.lineageStr = uniqueId + ' | ' + taxonomyStr.split(';')[-1]
                    binMarkerSets.addMarkerSet(markerSet)

                    # NOTE(review): this call uses four leading underscores, so
                    # name mangling only resolves it if the method is defined
                    # with the same four-underscore name - confirm against the
                    # class definition.
                    refinedMarkerSet = self.____removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)
                    refinedBinMarkerSet.addMarkerSet(refinedMarkerSet)

            curNode = curNode.parent_node

        return binMarkerSets, refinedBinMarkerSet
Example #3
0
    def testBinMarkerSets(self):
        """Exercise the BinMarkerSets container with two marker sets.

        Confirms that the marker genes of both sets are merged, and that the
        set added first is reported as the most specific and selected set.
        """
        container = BinMarkerSets(0, BinMarkerSets.TAXONOMIC_MARKER_SET)

        primary = MarkerSet(1, "k__Bacteria", 100, [{"a", "b"}, {"c"}])
        container.addMarkerSet(primary)

        secondary = MarkerSet(2, "k__Bacteria", 100, [{"d", "e"}, {"f"}])
        container.addMarkerSet(secondary)

        # union of the genes held by both marker sets
        allGenes = {"a", "b", "c", "d", "e", "f"}
        self.assertEqual(container.getMarkerGenes(), allGenes)

        # both selection accessors are expected to return the first set added
        for chosen in (container.mostSpecificMarkerSet(),
                       container.selectedMarkerSet()):
            self.assertEqual(chosen, primary)
Example #4
0
 def testBinMarkerSets(self):
     """Verify gene aggregation and set selection of BinMarkerSets.

     After adding two marker sets, the bin must expose the union of their
     marker genes and return the first set added from both
     mostSpecificMarkerSet() and selectedMarkerSet().
     """
     binSets = BinMarkerSets(0, BinMarkerSets.TAXONOMIC_MARKER_SET)

     setOne = MarkerSet(1, 'k__Bacteria', 100, [{'a', 'b'}, {'c'}])
     binSets.addMarkerSet(setOne)

     setTwo = MarkerSet(2, 'k__Bacteria', 100, [{'d', 'e'}, {'f'}])
     binSets.addMarkerSet(setTwo)

     # genes from both marker sets should be merged into a single set
     mergedGenes = {'a', 'b', 'c', 'd', 'e', 'f'}
     self.assertEqual(binSets.getMarkerGenes(), mergedGenes)

     # the first set added is expected from both selection accessors
     self.assertEqual(binSets.mostSpecificMarkerSet(), setOne)
     self.assertEqual(binSets.selectedMarkerSet(), setOne)
Example #5
0
    def markerSet(self, rank, taxon, markerFile):
        """Obtain specified taxonomic-specific marker set.

        Looks up the marker set for the given rank and taxon, then collects
        the marker set of every taxon in its lineage string (most specific
        first) and writes them all to markerFile.

        Arguments:
          rank       - taxonomic rank used as the first-level key of the
                       marker sets returned by self.readMarkerSets()
          taxon      - taxon name used as the second-level key
          markerFile - path of the file to write; overwritten if it exists

        Returns:
          True on success; False (after logging an error) if the rank or
          taxon is not present in the available marker sets.
        """

        taxonMarkerSets = self.readMarkerSets()

        if rank not in taxonMarkerSets:
            self.logger.error('  Unrecognized taxonomic rank: ' + rank)
            return False
        elif taxon not in taxonMarkerSets[rank]:
            self.logger.error('  Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
            return False

        markerSet = taxonMarkerSets[rank][taxon]

        # reverse the lineage so it runs from most to least specific taxon
        taxonomy = markerSet.lineageStr.split(';')[::-1]
        binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

        # separate locals so the loop does not rebind the rank/taxon parameters
        curRank = rank
        for i, lineageTaxon in enumerate(taxonomy):
            # a rank of 'life' is kept as-is; otherwise the rank is derived
            # from the position of the taxon within the lineage
            if curRank != 'life':
                curRank = ranksByLevel[len(taxonomy)-i-1]

            # species entries are keyed by the two most specific lineage
            # entries joined with a space
            if curRank == 'species':
                lineageTaxon = taxonomy[1] + ' ' + taxonomy[0]

            markerSet = taxonMarkerSets[curRank][lineageTaxon]
            numMarkers, numMarkerSets = markerSet.size()
            self.logger.info('  Marker set for %s contains %d marker genes arranged in %d sets.' % (lineageTaxon, numMarkers, numMarkerSets))
            self.logger.info('    Marker set inferred from %d reference genomes.' % markerSet.numGenomes)

            markerSet.lineageStr = lineageTaxon
            binMarkerSets.addMarkerSet(markerSet)

        # the context manager closes the file even if a write fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
            binMarkerSets.write(fout)

        return True
Example #6
0
    def markerSet(self, rank, taxon, markerFile):
        """Obtain specified taxonomic-specific marker set.

        Looks up the marker set for the given rank and taxon, then collects
        the marker set of every taxon in its lineage string (most specific
        first) and writes them all to markerFile.

        Arguments:
          rank       - taxonomic rank used as the first-level key of the
                       marker sets returned by self.readMarkerSets()
          taxon      - taxon name used as the second-level key
          markerFile - path of the file to write; overwritten if it exists

        Returns:
          True on success; False (after logging an error) if the rank or
          taxon is not present in the available marker sets.
        """

        taxonMarkerSets = self.readMarkerSets()

        if rank not in taxonMarkerSets:
            self.logger.error('  Unrecognized taxonomic rank: ' + rank)
            return False
        elif taxon not in taxonMarkerSets[rank]:
            self.logger.error('  Unrecognized taxon: %s (in rank %s): ' %
                              (taxon, rank))
            return False

        markerSet = taxonMarkerSets[rank][taxon]

        # reverse the lineage so it runs from most to least specific taxon
        taxonomy = markerSet.lineageStr.split(';')[::-1]
        binMarkerSets = BinMarkerSets(taxon,
                                      BinMarkerSets.TAXONOMIC_MARKER_SET)

        # separate locals so the loop does not rebind the rank/taxon parameters
        curRank = rank
        for i, lineageTaxon in enumerate(taxonomy):
            # a rank of 'life' is kept as-is; otherwise the rank is derived
            # from the position of the taxon within the lineage
            if curRank != 'life':
                curRank = ranksByLevel[len(taxonomy) - i - 1]

            # species entries are keyed by the two most specific lineage
            # entries joined with a space
            if curRank == 'species':
                lineageTaxon = taxonomy[1] + ' ' + taxonomy[0]

            markerSet = taxonMarkerSets[curRank][lineageTaxon]
            numMarkers, numMarkerSets = markerSet.size()
            self.logger.info(
                '  Marker set for %s contains %d marker genes arranged in %d sets.'
                % (lineageTaxon, numMarkers, numMarkerSets))
            self.logger.info(
                '    Marker set inferred from %d reference genomes.' %
                markerSet.numGenomes)

            markerSet.lineageStr = lineageTaxon
            binMarkerSets.addMarkerSet(markerSet)

        # the context manager closes the file even if a write fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
            binMarkerSets.write(fout)

        return True
Example #7
0
    def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers,
                         bootstrap, bNoLineageSpecificRefinement, bForceDomain,
                         bRequireTaxonomy, resultsParser, minUnique, maxMulti):
        """Determine marker sets for each bin.

        Loads the placement tree found under outDir, determines the marker
        sets applicable to every bin and writes them to markerFile. While
        processing, a progress line is written to stderr if the logger's
        effective level is INFO or lower.

        Arguments:
          outDir            - output directory holding 'storage/tree/' and
                              the bins to process
          markerFile        - path of the marker file to write (overwritten)
          numGenomesMarkers, bootstrap, bRequireTaxonomy
                            - passed through to the marker set selection
          bNoLineageSpecificRefinement
                            - if True, skip removal of lineage-specific
                              invalid marker genes
          bForceDomain      - passed to the marker set selection; also forced
                              on for a bin with too few unique hits or too
                              many multi-copy hits
          resultsParser     - object whose results[binId].countUniqueHits()
                              gives (uniqueHits, multiCopyHits) for a bin
          minUnique         - fewer unique hits than this forces the domain set
          maxMulti          - more multi-copy hits than this forces the domain set
        """

        self.logger.info('  Determining marker sets for each genome bin.')

        # get all bin ids
        binIds = getBinIdsFromOutDir(outDir)

        # get statistics for internal nodes
        uniqueIdToLineageStatistics = self.readNodeMetadata()

        # determine marker set for each bin
        treeFile = os.path.join(outDir, 'storage', 'tree',
                                DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

        # the context manager closes the marker file even if processing of a
        # bin raises part-way through
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

            numProcessedBins = 0
            statusStr = ''
            for binId in binIds:
                if self.logger.getEffectiveLevel() <= logging.INFO:
                    numProcessedBins += 1
                    sys.stderr.write(' ' * len(statusStr) +
                                     '\r')  # clear previous line
                    statusStr = '    Finished processing %d of %d (%.2f%%) bins (current: %s).' % (
                        numProcessedBins, len(binIds),
                        float(numProcessedBins) * 100 / len(binIds), binId)
                    sys.stderr.write('%s\r' % statusStr)
                    sys.stderr.flush()

                node = tree.find_node_with_taxon_label(binId)
                binMarkerSets = BinMarkerSets(binId,
                                              BinMarkerSets.TREE_MARKER_SET)
                if node is None:
                    # bin is not in tree
                    node, markerSet = self.__getMarkerSet(
                        rootNode, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, bForceDomain,
                        bRequireTaxonomy)
                    binMarkerSets.addMarkerSet(markerSet)
                else:
                    # special case: if node is on the bacterial or archaeal branch descendant from the root,
                    # then move down the tree to include the domain-specific marker set

                    # reset for every bin so a value left over from a previous
                    # bin is never used when no labeled ancestor is found
                    bRoot = False
                    parentNode = node.parent_node
                    while parentNode is not None:
                        if parentNode.label:
                            bRoot = parentNode.parent_node is None
                            break

                        parentNode = parentNode.parent_node

                    if bRoot:
                        # since the root is the first labeled node, we need to descend the
                        # tree to incorporate the domain-specific marker set
                        domainNode = self.__findDomainNode(node)
                        curNode = domainNode.child_nodes()[0]
                    else:
                        curNode = node

                    # get lineage specific refinement for first node with an id
                    # (parentNode is None here only when no ancestor carries a
                    # label; the lookup below then fails with AttributeError)
                    if not bNoLineageSpecificRefinement:
                        uniqueId = parentNode.label.split('|')[0]
                        self.__readLineageSpecificGenesToRemove()
                        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[
                            uniqueId]

                    # ascend tree to root, recording all marker sets meeting selection criteria
                    while curNode.parent_node is not None:
                        uniqueHits, multiCopyHits = resultsParser.results[
                            binId].countUniqueHits()
                        tempForceDomain = bForceDomain or (
                            uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                        curNode, markerSet = self.__getMarkerSet(
                            curNode.parent_node, tree,
                            uniqueIdToLineageStatistics, numGenomesMarkers,
                            bootstrap, tempForceDomain, bRequireTaxonomy)

                        if not bNoLineageSpecificRefinement:
                            markerSet = self.__removeInvalidLineageMarkerGenes(
                                markerSet, lineageSpecificRefinement)

                        binMarkerSets.addMarkerSet(markerSet)

                binMarkerSets.write(fout)

        # terminate the carriage-return based progress line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')
Example #8
0
    def getBinMarkerSets(self, outDir, markerFile,
                                    numGenomesMarkers,
                                    bootstrap, bNoLineageSpecificRefinement,
                                    bForceDomain, bRequireTaxonomy,
                                    resultsParser, minUnique, maxMulti):
        """Determine marker sets for each bin.

        Loads the placement tree found under outDir, determines the marker
        sets applicable to every bin and writes them to markerFile. While
        processing, a progress line is written to stderr if the logger's
        effective level is INFO or lower.

        Arguments:
          outDir            - output directory holding 'storage/tree/' and
                              the bins to process
          markerFile        - path of the marker file to write (overwritten)
          numGenomesMarkers, bootstrap, bRequireTaxonomy
                            - passed through to the marker set selection
          bNoLineageSpecificRefinement
                            - if True, skip removal of lineage-specific
                              invalid marker genes
          bForceDomain      - passed to the marker set selection; also forced
                              on for a bin with too few unique hits or too
                              many multi-copy hits
          resultsParser     - object whose results[binId].countUniqueHits()
                              gives (uniqueHits, multiCopyHits) for a bin
          minUnique         - fewer unique hits than this forces the domain set
          maxMulti          - more multi-copy hits than this forces the domain set
        """

        self.logger.info('  Determining marker sets for each genome bin.')

        # get all bin ids
        binIds = getBinIdsFromOutDir(outDir)

        # get statistics for internal nodes
        uniqueIdToLineageStatistics = self.readNodeMetadata()

        # determine marker set for each bin
        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        rootNode = tree.find_node(filter_fn = lambda n: n.parent_node is None)

        # the context manager closes the marker file even if processing of a
        # bin raises part-way through
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

            numProcessedBins = 0
            for binId in binIds:
                if self.logger.getEffectiveLevel() <= logging.INFO:
                    numProcessedBins += 1
                    statusStr = '    Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, len(binIds), float(numProcessedBins)*100/len(binIds))
                    sys.stderr.write('%s\r' % statusStr)
                    sys.stderr.flush()

                node = tree.find_node_with_taxon_label(binId)
                binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
                if node is None:
                    # bin is not in tree
                    node, markerSet = self.__getMarkerSet(rootNode, tree, uniqueIdToLineageStatistics,
                                                            numGenomesMarkers, bootstrap,
                                                            bForceDomain, bRequireTaxonomy)
                    binMarkerSets.addMarkerSet(markerSet)
                else:
                    # special case: if node is on the bacterial or archaeal branch descendant from the root,
                    # then move down the tree to include the domain-specific marker set

                    # reset for every bin so a value left over from a previous
                    # bin is never used when no labeled ancestor is found
                    bRoot = False
                    parentNode = node.parent_node
                    while parentNode is not None:
                        if parentNode.label:
                            bRoot = parentNode.parent_node is None
                            break

                        parentNode = parentNode.parent_node

                    if bRoot:
                        # since the root is the first labeled node, we need to descend the
                        # tree to incorporate the domain-specific marker set
                        domainNode = self.__findDomainNode(node)
                        curNode = domainNode.child_nodes()[0]
                    else:
                        curNode = node

                    # get lineage specific refinement for first node with an id
                    # (parentNode is None here only when no ancestor carries a
                    # label; the lookup below then fails with AttributeError)
                    if not bNoLineageSpecificRefinement:
                        uniqueId = parentNode.label.split('|')[0]
                        self.__readLineageSpecificGenesToRemove()
                        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                    # ascend tree to root, recording all marker sets meeting selection criteria
                    while curNode.parent_node is not None:
                        uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                        tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                        curNode, markerSet = self.__getMarkerSet(curNode.parent_node, tree, uniqueIdToLineageStatistics,
                                                                    numGenomesMarkers, bootstrap,
                                                                    tempForceDomain, bRequireTaxonomy)

                        if not bNoLineageSpecificRefinement:
                            markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                        binMarkerSets.addMarkerSet(markerSet)

                binMarkerSets.write(fout)

        # terminate the carriage-return based progress line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')