Example #1
0
    def markerSet(self, rank, taxon, markerFile):
        """Obtain specified taxonomic-specific marker set.

        Looks up the marker set stored for `taxon` under `rank` in the table
        returned by self.readMarkerSets(), then collects the marker set of
        every entry in that marker set's lineage string and writes the
        collection to `markerFile`.

        rank       -- taxonomic rank; must be a key of the marker set table
        taxon      -- taxon name; must be a key of the table entry for `rank`
        markerFile -- path of the output file (opened in 'w' mode)

        Returns False (after logging an error, without touching
        `markerFile`) when `rank` or `taxon` is not recognized, and True
        once the marker file has been written.

        Note: the `lineageStr` attribute of each marker set that is written
        is overwritten with the name of the taxon it was looked up under.
        """

        taxonMarkerSets = self.readMarkerSets()

        if rank not in taxonMarkerSets:
            self.logger.error('  Unrecognized taxonomic rank: ' + rank)
            return False
        elif taxon not in taxonMarkerSets[rank]:
            self.logger.error('  Unrecognized taxon: %s (in rank %s): ' %
                              (taxon, rank))
            return False

        # lineage entries in reversed order; presumably this puts the
        # requested taxon first and the most general taxon last
        taxonomy = taxonMarkerSets[rank][taxon].lineageStr.split(';')[::-1]
        binMarkerSets = BinMarkerSets(taxon,
                                      BinMarkerSets.TAXONOMIC_MARKER_SET)

        # the rank is tracked in a local so the caller-supplied arguments
        # are not overwritten while walking the lineage
        curRank = rank
        for i, curTaxon in enumerate(taxonomy):
            # once the rank is 'life' it is no longer re-derived from the
            # position within the lineage
            if curRank != 'life':
                curRank = ranksByLevel[len(taxonomy) - i - 1]

            # species marker sets are keyed by the two leading lineage
            # entries joined as '<taxonomy[1]> <taxonomy[0]>'
            if curRank == 'species':
                curTaxon = taxonomy[1] + ' ' + taxonomy[0]

            curMarkerSet = taxonMarkerSets[curRank][curTaxon]
            numMarkers, numMarkerSets = curMarkerSet.size()
            self.logger.info(
                '  Marker set for %s contains %d marker genes arranged in %d sets.'
                % (curTaxon, numMarkers, numMarkerSets))
            self.logger.info(
                '    Marker set inferred from %d reference genomes.' %
                curMarkerSet.numGenomes)

            curMarkerSet.lineageStr = curTaxon
            binMarkerSets.addMarkerSet(curMarkerSet)

        # context manager guarantees the file is closed even if writing fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
            binMarkerSets.write(fout)

        return True
Example #2
0
    def markerSet(self, rank, taxon, markerFile):
        """Obtain specified taxonomic-specific marker set.

        The marker set registered for `taxon` at `rank` (in the table
        returned by self.readMarkerSets()) supplies a lineage string; the
        marker set of every entry of that lineage is gathered and the whole
        collection is written to `markerFile`.

        rank       -- taxonomic rank; must be a key of the marker set table
        taxon      -- taxon name; must be a key of the table entry for `rank`
        markerFile -- path of the output file (opened in 'w' mode)

        Returns False if `rank` or `taxon` is unrecognized (an error is
        logged and no file is written); otherwise returns True after the
        marker file has been written.

        Side effect: `lineageStr` of every marker set that is written is
        replaced by the taxon name used to look it up.
        """

        taxonMarkerSets = self.readMarkerSets()

        if rank not in taxonMarkerSets:
            self.logger.error('  Unrecognized taxonomic rank: ' + rank)
            return False
        elif taxon not in taxonMarkerSets[rank]:
            self.logger.error('  Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
            return False

        requestedMarkerSet = taxonMarkerSets[rank][taxon]

        # reversed lineage; presumably most specific entry first
        taxonomy = requestedMarkerSet.lineageStr.split(';')[::-1]
        binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

        # walk the lineage with loop-local names instead of clobbering the
        # `rank` and `taxon` arguments
        lineageRank = rank
        for i, lineageTaxon in enumerate(taxonomy):
            # a rank of 'life' is sticky: it is never re-derived from the level
            if lineageRank != 'life':
                lineageRank = ranksByLevel[len(taxonomy)-i-1]

            # species entries are looked up as '<taxonomy[1]> <taxonomy[0]>'
            if lineageRank == 'species':
                lineageTaxon = taxonomy[1] + ' ' + taxonomy[0]

            lineageMarkerSet = taxonMarkerSets[lineageRank][lineageTaxon]
            numMarkers, numMarkerSets = lineageMarkerSet.size()
            self.logger.info('  Marker set for %s contains %d marker genes arranged in %d sets.' % (lineageTaxon, numMarkers, numMarkerSets))
            self.logger.info('    Marker set inferred from %d reference genomes.' % lineageMarkerSet.numGenomes)

            lineageMarkerSet.lineageStr = lineageTaxon
            binMarkerSets.addMarkerSet(lineageMarkerSet)

        # `with` closes the file even when a write raises
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
            binMarkerSets.write(fout)

        return True
Example #3
0
    def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers,
                         bootstrap, bNoLineageSpecificRefinement, bForceDomain,
                         bRequireTaxonomy, resultsParser, minUnique, maxMulti):
        """Determine marker sets for each bin and write them to markerFile.

        For every bin id obtained from `outDir`, the bin is looked up by
        taxon label in the tree stored under `outDir`/storage/tree. A bin
        that is missing from the tree receives the single marker set
        selected starting at the root node. A bin found in the tree receives
        every marker set selected while ascending from its placement to the
        root.

        outDir            -- output directory holding the bins' results and
                             the tree file
        markerFile        -- path of the marker file to write ('w' mode)
        numGenomesMarkers -- forwarded unchanged to the marker set selection
        bootstrap         -- forwarded unchanged to the marker set selection
        bNoLineageSpecificRefinement -- if true, marker sets are not passed
                             through the lineage-specific gene removal step
        bForceDomain      -- forwarded to the marker set selection; for bins
                             in the tree it is also switched on when the bin
                             has fewer than `minUnique` unique hits or more
                             than `maxMulti` multi-copy hits
        bRequireTaxonomy  -- forwarded unchanged to the marker set selection
        resultsParser     -- object whose results[binId].countUniqueHits()
                             yields (unique hits, multi-copy hits)
        minUnique         -- lower bound on unique hits (see bForceDomain)
        maxMulti          -- upper bound on multi-copy hits (see bForceDomain)

        Returns nothing. Progress is reported on stderr when the logger's
        effective level is INFO or lower.
        """

        self.logger.info('  Determining marker sets for each genome bin.')

        # get all bin ids
        binIds = getBinIdsFromOutDir(outDir)

        # get statistics for internal nodes
        uniqueIdToLineageStatistics = self.readNodeMetadata()

        # load the tree the bins were placed into
        treeFile = os.path.join(outDir, 'storage', 'tree',
                                DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

        numProcessedBins = 0
        statusStr = ''

        # the context manager closes the marker file even if processing a
        # bin raises
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

            # determine marker set for each bin
            for binId in binIds:
                if self.logger.getEffectiveLevel() <= logging.INFO:
                    numProcessedBins += 1
                    sys.stderr.write(' ' * len(statusStr) +
                                     '\r')  # clear previous line
                    statusStr = '    Finished processing %d of %d (%.2f%%) bins (current: %s).' % (
                        numProcessedBins, len(binIds),
                        float(numProcessedBins) * 100 / len(binIds), binId)
                    sys.stderr.write('%s\r' % statusStr)
                    sys.stderr.flush()

                node = tree.find_node_with_taxon_label(binId)
                binMarkerSets = BinMarkerSets(binId,
                                              BinMarkerSets.TREE_MARKER_SET)
                if node is None:
                    # bin is not in tree
                    _, markerSet = self.__getMarkerSet(
                        rootNode, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, bForceDomain,
                        bRequireTaxonomy)
                    binMarkerSets.addMarkerSet(markerSet)
                else:
                    # special case: if node is on the bacterial or archaeal branch descendant from the root,
                    # then move down the tree to include the domain-specific marker set

                    # reset for every bin so the flag is always bound and a
                    # value left over from a previous bin is never reused
                    # when no labelled ancestor exists
                    bRoot = False
                    parentNode = node.parent_node
                    while parentNode is not None:
                        if parentNode.label:
                            bRoot = (parentNode.parent_node is None)
                            break

                        parentNode = parentNode.parent_node

                    if bRoot:
                        # since the root is the first labeled node, we need to descend the
                        # tree to incorporate the domain-specific marker set
                        domainNode = self.__findDomainNode(node)
                        curNode = domainNode.child_nodes()[0]
                    else:
                        curNode = node

                    # get lineage specific refinement for first node with an id
                    if not bNoLineageSpecificRefinement:
                        # the unique id is the text before the first '|' of
                        # the first labelled ancestor's label
                        uniqueId = parentNode.label.split('|')[0]
                        self.__readLineageSpecificGenesToRemove()
                        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[
                            uniqueId]

                    # the hit counts depend only on the bin, so they are
                    # computed once instead of on every step of the ascent
                    uniqueHits, multiCopyHits = resultsParser.results[
                        binId].countUniqueHits()
                    tempForceDomain = bForceDomain or (
                        uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                    # ascend tree to root, recording all marker sets meeting selection criteria
                    while curNode.parent_node is not None:
                        curNode, markerSet = self.__getMarkerSet(
                            curNode.parent_node, tree,
                            uniqueIdToLineageStatistics, numGenomesMarkers,
                            bootstrap, tempForceDomain, bRequireTaxonomy)

                        if not bNoLineageSpecificRefinement:
                            markerSet = self.__removeInvalidLineageMarkerGenes(
                                markerSet, lineageSpecificRefinement)

                        binMarkerSets.addMarkerSet(markerSet)

                binMarkerSets.write(fout)

        # terminate the in-place progress line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')
Example #4
0
    def getBinMarkerSets(self, outDir, markerFile,
                                    numGenomesMarkers,
                                    bootstrap, bNoLineageSpecificRefinement,
                                    bForceDomain, bRequireTaxonomy,
                                    resultsParser, minUnique, maxMulti):
        """Determine marker sets for each bin and write them to markerFile.

        Each bin id obtained from `outDir` is looked up by taxon label in the
        tree stored under `outDir`/storage/tree. A bin absent from the tree
        gets the single marker set selected starting at the root node; a bin
        present in the tree gets every marker set selected while ascending
        from its placement to the root.

        outDir            -- output directory holding the bins' results and
                             the tree file
        markerFile        -- path of the marker file to write ('w' mode)
        numGenomesMarkers -- forwarded unchanged to the marker set selection
        bootstrap         -- forwarded unchanged to the marker set selection
        bNoLineageSpecificRefinement -- if true, the lineage-specific gene
                             removal step is skipped
        bForceDomain      -- forwarded to the marker set selection; for bins
                             in the tree it is also enabled when the bin has
                             fewer than `minUnique` unique hits or more than
                             `maxMulti` multi-copy hits
        bRequireTaxonomy  -- forwarded unchanged to the marker set selection
        resultsParser     -- object whose results[binId].countUniqueHits()
                             yields (unique hits, multi-copy hits)
        minUnique         -- lower bound on unique hits (see bForceDomain)
        maxMulti          -- upper bound on multi-copy hits (see bForceDomain)

        Returns nothing. Progress is written to stderr when the logger's
        effective level is INFO or lower.
        """

        self.logger.info('  Determining marker sets for each genome bin.')

        # get all bin ids
        binIds = getBinIdsFromOutDir(outDir)

        # get statistics for internal nodes
        uniqueIdToLineageStatistics = self.readNodeMetadata()

        # load the tree the bins were placed into
        # NOTE(review): `as_rooted` is the legacy DendroPy keyword; newer
        # releases spell this rooting='force-rooted' -- confirm which
        # DendroPy version this code targets before changing it
        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        rootNode = tree.find_node(filter_fn = lambda n: n.parent_node is None)

        numProcessedBins = 0

        # `with` guarantees the marker file is closed even if a bin fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

            # determine marker set for each bin
            for binId in binIds:
                if self.logger.getEffectiveLevel() <= logging.INFO:
                    numProcessedBins += 1
                    statusStr = '    Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, len(binIds), float(numProcessedBins)*100/len(binIds))
                    sys.stderr.write('%s\r' % statusStr)
                    sys.stderr.flush()

                node = tree.find_node_with_taxon_label(binId)
                binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
                if node is None:
                    # bin is not in tree
                    _, markerSet = self.__getMarkerSet(rootNode, tree, uniqueIdToLineageStatistics,
                                                        numGenomesMarkers, bootstrap,
                                                        bForceDomain, bRequireTaxonomy)
                    binMarkerSets.addMarkerSet(markerSet)
                else:
                    # special case: if node is on the bacterial or archaeal branch descendant from the root,
                    # then move down the tree to include the domain-specific marker set

                    # start from a known value for every bin: otherwise the
                    # flag is unbound (first bin) or stale (later bins) when
                    # no labelled ancestor is found
                    bRoot = False
                    parentNode = node.parent_node
                    while parentNode is not None:
                        if parentNode.label:
                            bRoot = (parentNode.parent_node is None)
                            break

                        parentNode = parentNode.parent_node

                    if bRoot:
                        # since the root is the first labeled node, we need to descend the
                        # tree to incorporate the domain-specific marker set
                        domainNode = self.__findDomainNode(node)
                        curNode = domainNode.child_nodes()[0]
                    else:
                        curNode = node

                    # get lineage specific refinement for first node with an id
                    if not bNoLineageSpecificRefinement:
                        # unique id = label text before the first '|'
                        uniqueId = parentNode.label.split('|')[0]
                        self.__readLineageSpecificGenesToRemove()
                        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                    # hit counts depend only on the bin, not on the position
                    # in the tree, so compute them once before ascending
                    uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                    tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                    # ascend tree to root, recording all marker sets meeting selection criteria
                    while curNode.parent_node is not None:
                        curNode, markerSet = self.__getMarkerSet(curNode.parent_node, tree, uniqueIdToLineageStatistics,
                                                                    numGenomesMarkers, bootstrap,
                                                                    tempForceDomain, bRequireTaxonomy)

                        if not bNoLineageSpecificRefinement:
                            markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                        binMarkerSets.addMarkerSet(markerSet)

                binMarkerSets.write(fout)

        # terminate the in-place progress line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')