Example #1
0
    def testDynamicOutgroupsJustLeaves(self):
        """Exercise DynamicOutgroup on a fixed nine-leaf tree.

        compute() is run twice with maxNumOutgroups=3: once with
        sequenceLossWeight=0 and once without overriding that weight.  Both
        runs must respect the outgroup cap and keep every entry sorted on
        its second field (closest outgroup first).
        """
        newick = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        parsed = NXNewick().parseString(newick, addImpliedRoots = False)
        mcTree = MultiCactusTree(parsed)
        mcTree.computeSubtreeRoots()

        def withinLimit(entry):
            # no event may be given more than three outgroups
            return len(entry) <= 3

        def closestFirst(entry):
            # an entry is well-formed when it is already sorted on field 1
            return entry == sorted(entry, key=itemgetter(1))

        # Pass 1: sequence loss is given zero weight.
        unweighted = DynamicOutgroup()
        unweighted.importTree(mcTree, self.blanchetteSeqMap)
        unweighted.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        assert all(withinLimit(entry) for entry in unweighted.ogMap.values())
        assert all(closestFirst(entry) for entry in unweighted.ogMap.values())
        # ordering is important!
        assert unweighted.ogMap['Anc1'][0][0] == 'HUMAN'
        assert unweighted.ogMap['Anc7'][0][0] == 'BABOON'

        # Pass 2: same tree, sequenceLossWeight left at its default.
        weighted = DynamicOutgroup()
        weighted.importTree(mcTree, self.blanchetteSeqMap)
        weighted.compute(maxNumOutgroups=3)
        assert all(withinLimit(entry) for entry in weighted.ogMap.values())
        # dynamic outgroups are kept sorted by distance as well
        assert all(closestFirst(entry) for entry in weighted.ogMap.values())
def fillInOutgroups(mcProj, outgroupNames, config, alignmentRootId):
    """
    Choose outgroups for a MultiCactusProject using the config's strategy.

    The chosen outgroup object is stored on ``mcProj.outgroup``; that
    attribute is reset to None first and stays None for the 'none' strategy.

    mcProj: project whose ``mcTree`` (and, for 'dynamic',
        ``inputSequenceMap``) are read and whose ``outgroup`` is written.
    outgroupNames: candidate set handed to the outgroup selector.
    config: queried for the strategy name and the selection parameters.
    alignmentRootId: forwarded to the selector's importTree().

    Raises RuntimeError when the strategy name is not recognized.
    """
    mcProj.outgroup = None
    strategy = config.getOutgroupStrategy()

    if strategy == 'none':
        return

    if strategy in ('greedy', 'greedyLeaves', 'greedyPreference'):
        selector = GreedyOutgroup()
        mcProj.outgroup = selector
        selector.importTree(mcProj.mcTree, alignmentRootId)

        # 'greedy' and 'greedyLeaves' make one pass over the supplied
        # candidates.  'greedyPreference' makes that same pass and then a
        # second, unrestricted one so that other nodes can act as "filler"
        # if not enough outgroups were found.
        candidatePasses = [outgroupNames]
        if strategy == 'greedyPreference':
            candidatePasses.append(None)

        for candidates in candidatePasses:
            selector.greedy(
                threshold=config.getOutgroupThreshold(),
                candidateSet=candidates,
                # 'greedyLeaves' pins the child fraction to 2.0 instead of
                # asking the config for it.
                candidateChildFrac=(
                    2.0 if strategy == 'greedyLeaves'
                    else config.getOutgroupAncestorQualityFraction()),
                maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        selector = DynamicOutgroup()
        mcProj.outgroup = selector
        selector.importTree(mcProj.mcTree,
                            mcProj.inputSequenceMap,
                            alignmentRootId,
                            candidateSet=outgroupNames)
        selector.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    else:
        raise RuntimeError("Could not understand outgroup strategy %s" %
                           strategy)
Example #3
0
    def testDynamicOutgroupsJustLeaves(self):
        """Run DynamicOutgroup on self.borMcTree and check its output shape.

        Two runs with maxNumOutgroups=3 are made: one with
        sequenceLossWeight=0 and one without overriding it.  Each run must
        cap entries at three outgroups and keep every entry sorted on its
        second field.
        """
        # Run 1: sequence loss carries no weight.
        unweighted = DynamicOutgroup()
        unweighted.importTree(self.borMcTree, self.blanchetteSeqMap)
        unweighted.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # every entry is capped at three outgroups
        assert all(len(entry) <= 3 for entry in unweighted.ogMap.values())
        # and within every entry the closest outgroup comes first
        assert all(entry == sorted(entry, key=itemgetter(1))
                   for entry in unweighted.ogMap.values())
        # ordering is important!
        assert unweighted.ogMap['Anc1'][0][0] == 'HUMAN'
        assert unweighted.ogMap['Anc7'][0][0] == 'BABOON'

        # Run 2: sequenceLossWeight is not overridden.
        weighted = DynamicOutgroup()
        weighted.importTree(self.borMcTree, self.blanchetteSeqMap)
        weighted.compute(maxNumOutgroups=3)
        assert all(len(entry) <= 3 for entry in weighted.ogMap.values())
        # dynamic outgroups stay sorted by distance too
        assert all(entry == sorted(entry, key=itemgetter(1))
                   for entry in weighted.ogMap.values())
Example #4
0
 def testDynamicOutgroupsOnRandomTrees(self):
     """Check DynamicOutgroup on each (tree, seqMap) pair from the fixtures.

     Trees that contain a node with eight or more children are skipped.
     """
     for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
         maxDegree = max(len(tree.getChildren(node))
                         for node in tree.breadthFirstTraversal())
         if maxDegree >= 8:
             continue

         selector = DynamicOutgroup()
         selector.edgeLen = 5
         selector.importTree(tree, seqMap)
         selector.compute(maxNumOutgroups=3)

         # no entry may hold more than three outgroups
         assert all(len(entry) <= 3 for entry in selector.ogMap.values())
         # and for all entries, the closest must be first.
         # (this will be true because all sequences are the same)
         assert all(entry == sorted(entry, key=itemgetter(1))
                    for entry in selector.ogMap.values())
Example #5
0
 def testDynamicOutgroupsOnRandomTrees(self):
     """Run DynamicOutgroup over the paired tree / sequence-map fixtures.

     Only trees whose largest child count is below eight are exercised.
     """
     def cappedAtThree(entry):
         return len(entry) <= 3

     def closestFirst(entry):
         # sorted on the second field of each item
         return entry == sorted(entry, key=itemgetter(1))

     for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
         childCounts = [len(tree.getChildren(node))
                        for node in tree.breadthFirstTraversal()]
         if max(childCounts) < 8:
             dyn = DynamicOutgroup()
             dyn.edgeLen = 5
             dyn.importTree(tree, seqMap)
             dyn.compute(maxNumOutgroups=3)
             # make sure all entries have <= 3 outgroups.
             assert all(cappedAtThree(entry) for entry in dyn.ogMap.values())
             # the closest outgroup must lead every entry
             # (this will be true because all sequences are the same)
             assert all(closestFirst(entry) for entry in dyn.ogMap.values())
def createMCProject(tree, experiment, config, options):
    """
    Build a MultiCactusProject from a tree, an experiment and a config.

    tree: passed to MultiCactusTree together with config.getSubtreeSize().
    experiment: supplies the input sequences through getSequences().
    config: queried for the subtree size, internal node prefix,
        self-alignment flag, outgroup strategy and outgroup parameters.
    options: its ``path``, ``root`` and ``outgroupNames`` attributes are read.

    Returns the populated MultiCactusProject after specifyAlignmentRoot()
    has been applied to it.

    Raises RuntimeError if ``options.root`` is given but cannot be resolved
    to a node id, or if the outgroup strategy is not recognized.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    # copy so the project does not share the experiment's list
    mcProj.inputSequences = experiment.getSequences()[:]
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Deliberately not a bare ``except:``: KeyboardInterrupt and
            # SystemExit must propagate rather than being reported as a
            # missing root name.
            raise RuntimeError("Specified root name %s not found in tree" %
                               options.root)
    mcProj.outgroup = None
    strategy = config.getOutgroupStrategy()
    if strategy == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set(
                [mcProj.mcTree.getName(x) for x in mcProj.mcTree.getLeaves()])
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree,
                                   mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" %
                           strategy)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
Example #7
0
    def testDynamicOutgroupsJustLeaves(self):
        """Validate DynamicOutgroup results computed on self.borMcTree.

        The selector is run with sequenceLossWeight=0 and again without
        overriding that weight, both times with maxNumOutgroups=3.
        """
        byDistance = itemgetter(1)

        def isCapped(entry):
            # at most three outgroups per entry
            return len(entry) <= 3

        def isOrdered(entry):
            # the closest outgroup must come first
            return entry == sorted(entry, key=byDistance)

        zeroLoss = DynamicOutgroup()
        zeroLoss.importTree(self.borMcTree, self.blanchetteSeqMap)
        zeroLoss.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        assert all(isCapped(entry) for entry in zeroLoss.ogMap.values())
        assert all(isOrdered(entry) for entry in zeroLoss.ogMap.values())
        # ordering is important!
        assert zeroLoss.ogMap['Anc1'][0][0] == 'HUMAN'
        assert zeroLoss.ogMap['Anc7'][0][0] == 'BABOON'

        defaultLoss = DynamicOutgroup()
        defaultLoss.importTree(self.borMcTree, self.blanchetteSeqMap)
        defaultLoss.compute(maxNumOutgroups=3)
        assert all(isCapped(entry) for entry in defaultLoss.ogMap.values())
        # we keep dynamic outgroups sorted by distance too
        assert all(isOrdered(entry) for entry in defaultLoss.ogMap.values())