def testDynamicOutgroupsJustLeaves(self):
    """Run DynamicOutgroup on a fixed nine-leaf tree and check its picks.

    The tree is parsed from an inline Newick string without implied roots.
    Outgroups are computed twice (once with sequenceLossWeight=0., once with
    the default weighting) and every ogMap entry must hold at most three
    outgroups, ordered by ascending value of field 1 (the distance,
    according to the original test comments).
    """
    newick = (
        '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,'
        '(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,'
        '((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    )
    parsedTree = NXNewick().parseString(newick, addImpliedRoots=False)
    mcTree = MultiCactusTree(parsedTree)
    mcTree.computeSubtreeRoots()

    secondField = itemgetter(1)

    # Pass 1: sequenceLossWeight explicitly zeroed
    # (presumably so that only tree distance drives the choice -- confirm).
    unweighted = DynamicOutgroup()
    unweighted.importTree(mcTree, self.blanchetteSeqMap)
    unweighted.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
    # no entry may exceed the requested maximum of 3 outgroups
    assert all(len(entry) <= 3 for entry in unweighted.ogMap.values())
    # within every entry the closest outgroup has to come first
    assert all(entry == sorted(entry, key=secondField)
               for entry in unweighted.ogMap.values())
    # the ordering matters: pin the first pick for two ancestors
    assert unweighted.ogMap['Anc1'][0][0] == 'HUMAN'
    assert unweighted.ogMap['Anc7'][0][0] == 'BABOON'

    # Pass 2: default sequenceLossWeight.
    weighted = DynamicOutgroup()
    weighted.importTree(mcTree, self.blanchetteSeqMap)
    weighted.compute(maxNumOutgroups=3)
    assert all(len(entry) <= 3 for entry in weighted.ogMap.values())
    # dynamic outgroups are kept sorted by distance as well
    assert all(entry == sorted(entry, key=secondField)
               for entry in weighted.ogMap.values())
def fillInOutgroups(mcProj, outgroupNames, config, alignmentRootId):
    """
    Populate mcProj.outgroup according to the strategy named in the config.

    mcProj.outgroup is first reset to None. The strategy string returned by
    config.getOutgroupStrategy() then selects one of:

      'greedy'           - one GreedyOutgroup pass over outgroupNames
                           (all candidates when none are given).
      'greedyLeaves'     - same pass, but with candidateChildFrac fixed at
                           2.0 (use leaves as outgroups, unless candidates
                           are given).
      'greedyPreference' - a pass over outgroupNames followed by a second
                           pass with candidateSet=None, so other nodes act
                           as "filler" if not enough were found.
      'dynamic'          - DynamicOutgroup over mcProj.inputSequenceMap.
      'none'             - mcProj.outgroup stays None.

    Any other strategy string raises RuntimeError.
    """
    mcProj.outgroup = None
    strategy = config.getOutgroupStrategy()

    if strategy in ('greedy', 'greedyLeaves', 'greedyPreference'):
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)

        threshold = config.getOutgroupThreshold()
        if strategy == 'greedyLeaves':
            # fixed fraction instead of the configured ancestor quality
            childFrac = 2.0
        else:
            childFrac = config.getOutgroupAncestorQualityFraction()
        maxOutgroups = config.getMaxNumOutgroups()

        # primary pass: honour the provided candidates, if any
        mcProj.outgroup.greedy(threshold=threshold,
                               candidateSet=outgroupNames,
                               candidateChildFrac=childFrac,
                               maxNumOutgroups=maxOutgroups)
        if strategy == 'greedyPreference':
            # filler pass: no candidate restriction
            mcProj.outgroup.greedy(threshold=threshold,
                                   candidateSet=None,
                                   candidateChildFrac=childFrac,
                                   maxNumOutgroups=maxOutgroups)
        return

    if strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive.  Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, mcProj.inputSequenceMap,
                                   alignmentRootId,
                                   candidateSet=outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
        return

    if strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" % strategy)
def testDynamicOutgroupsJustLeaves(self):
    """Check DynamicOutgroup picks on self.borMcTree, with and without
    the sequence-loss weighting.

    Both runs must produce at most three outgroups per entry, with each
    entry ordered by ascending value of field 1 (the distance, per the
    original test comments). The run with sequenceLossWeight=0. also pins
    the first outgroup chosen for Anc1 and Anc7.
    """
    def assertBoundedAndSorted(selector):
        # make sure all entries have <= 3 outgroups.
        assert all(len(entry) <= 3 for entry in selector.ogMap.values())
        # and for all entries, the closest must be first.
        assert all(entry == sorted(entry, key=itemgetter(1))
                   for entry in selector.ogMap.values())

    lossFree = DynamicOutgroup()
    lossFree.importTree(self.borMcTree, self.blanchetteSeqMap)
    lossFree.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
    assertBoundedAndSorted(lossFree)
    # ordering is important!
    assert lossFree.ogMap['Anc1'][0][0] == 'HUMAN'
    assert lossFree.ogMap['Anc7'][0][0] == 'BABOON'

    # second run uses the default sequenceLossWeight; dynamic outgroups
    # are kept sorted by distance there too
    defaultRun = DynamicOutgroup()
    defaultRun.importTree(self.borMcTree, self.blanchetteSeqMap)
    defaultRun.compute(maxNumOutgroups=3)
    assertBoundedAndSorted(defaultRun)
def testDynamicOutgroupsOnRandomTrees(self):
    """Run DynamicOutgroup over each (tree, sequence map) fixture pair.

    Trees containing a node with eight or more children are skipped
    (presumably to bound the run time -- confirm). For the rest, every
    ogMap entry must hold at most three outgroups, ordered by ascending
    value of field 1.
    """
    for mcTree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
        maxDegree = max(len(mcTree.getChildren(node))
                        for node in mcTree.breadthFirstTraversal())
        if maxDegree >= 8:
            continue

        selector = DynamicOutgroup()
        # set before importTree, as in the original test
        selector.edgeLen = 5
        selector.importTree(mcTree, seqMap)
        selector.compute(maxNumOutgroups=3)

        # make sure all entries have <= 3 outgroups.
        assert all(len(entry) <= 3 for entry in selector.ogMap.values())
        # and for all entries, the closest must be first.
        # (this will be true because all sequences are the same)
        assert all(entry == sorted(entry, key=itemgetter(1))
                   for entry in selector.ogMap.values())
def testDynamicOutgroupsOnRandomTrees(self):
    """Exercise DynamicOutgroup on the paired random trees and dummy
    sequence maps held by the fixture.

    Only trees whose largest child count is below eight are tested. Each
    resulting ogMap entry must contain no more than three outgroups and
    must already be sorted on field 1.
    """
    byField1 = itemgetter(1)
    pairs = zip(self.mcTrees, self.dummySeqMaps)
    for candidateTree, candidateSeqs in pairs:
        childCounts = [len(candidateTree.getChildren(nodeId))
                       for nodeId in candidateTree.breadthFirstTraversal()]
        if max(childCounts) < 8:
            outgroups = DynamicOutgroup()
            outgroups.edgeLen = 5
            outgroups.importTree(candidateTree, candidateSeqs)
            outgroups.compute(maxNumOutgroups=3)

            # make sure all entries have <= 3 outgroups.
            for chosen in outgroups.ogMap.values():
                assert len(chosen) <= 3
            # and for all entries, the closest must be first.
            # (this will be true because all sequences are the same)
            for chosen in outgroups.ogMap.values():
                assert chosen == sorted(chosen, key=byField1)
def createMCProject(tree, experiment, config, options):
    """
    Build a MultiCactusProject from a tree, an experiment and a config.

    Steps, in order:
      1. Wrap `tree` in a MultiCactusTree (subtree size from the config),
         name unlabeled internal nodes and compute the subtree roots.
      2. Copy the experiment's sequence list into the project and, if the
         config asks for self alignment, add self edges to the tree.
      3. Register an absolute experiment XML path under options.path for
         every subtree root name.
      4. Resolve the alignment root: the tree root by default, or the node
         named by options.root when that is not None.
      5. Select outgroups according to config.getOutgroupStrategy()
         ('greedy', 'greedyLeaves', 'greedyPreference', 'dynamic' or
         'none'), using options.outgroupNames as the candidate set.
      6. Call specifyAlignmentRoot on the project.

    Returns the populated MultiCactusProject.

    Raises RuntimeError if options.root cannot be resolved to a node id,
    or if the outgroup strategy string is not recognised.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()

    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    # slice-copy so the project does not alias the experiment's list
    mcProj.inputSequences = experiment.getSequences()[:]
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)

    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Deliberately not a bare `except:` -- KeyboardInterrupt and
            # SystemExit must propagate instead of being reported as a
            # missing root.
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    mcProj.outgroup = None
    strategy = config.getOutgroupStrategy()
    if strategy == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = {mcProj.mcTree.getName(x)
                     for x in mcProj.mcTree.getLeaves()}
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree,
                                   mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" % strategy)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
def testDynamicOutgroupsJustLeaves(self):
    """Validate DynamicOutgroup output for self.borMcTree.

    Two configurations are run in sequence: sequenceLossWeight=0. first,
    then the default weighting. Each must yield at most three outgroups
    per entry, sorted on field 1 (the distance, per the original test
    comments). The first configuration also pins the leading outgroup of
    Anc1 and Anc7.
    """
    runs = (
        # (compute kwargs, whether to pin the expected first outgroups)
        ({'maxNumOutgroups': 3, 'sequenceLossWeight': 0.}, True),
        ({'maxNumOutgroups': 3}, False),
    )
    for computeArgs, pinFirstChoice in runs:
        picker = DynamicOutgroup()
        picker.importTree(self.borMcTree, self.blanchetteSeqMap)
        picker.compute(**computeArgs)

        # make sure all entries have <= 3 outgroups.
        for chosen in picker.ogMap.values():
            assert len(chosen) <= 3
        # and for all entries, the closest must be first.
        for chosen in picker.ogMap.values():
            assert chosen == sorted(chosen, key=itemgetter(1))

        if pinFirstChoice:
            # ordering is important!
            assert picker.ogMap['Anc1'][0][0] == 'HUMAN'
            assert picker.ogMap['Anc7'][0][0] == 'BABOON'