def createMCProject(tree, experiment, config, options):
    """
    Creates a properly initialized MultiCactusProject.

    TODO: This should really all be in the constructor for MultiCactusProject.

    Args:
        tree: tree object handed to the MultiCactusTree constructor.
        experiment: object providing getGenomesWithSequence() and
            getSequenceID(genome); used to fill the project's
            inputSequenceMap.
        config: object providing getDefaultInternalNodePrefix() and
            getDoSelfAlignment(); it is also forwarded to fillInOutgroups.
        options: object whose `path`, `root` and `outgroupNames` attributes
            are read here.

    Returns:
        The populated MultiCactusProject.

    Raises:
        RuntimeError: if `options.root` is set but looking it up in the
            tree fails.
    """
    mcTree = MultiCactusTree(tree)
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()

    mcProj = MultiCactusProject()
    for genome in experiment.getGenomesWithSequence():
        mcProj.inputSequenceMap[genome] = experiment.getSequenceID(genome)
    mcProj.mcTree = mcTree
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()

    # One experiment file per subtree root: <options.path>/<name>/<name>_experiment.xml
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)

    # Default to the tree's own root unless the caller named a different one.
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit are not misreported as a
            # missing root name.
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    fillInOutgroups(mcProj, options.outgroupNames, config, alignmentRootId)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, experiment, alignmentRootId)
    return mcProj
def createMCProject(tree, experiment, config, options):
    """
    Creates a properly initialized MultiCactusProject, including its
    outgroup assignment.

    Args:
        tree: tree object handed to the MultiCactusTree constructor.
        experiment: object providing getSequences(); a shallow copy of the
            result is stored on the project as `inputSequences`.
        config: object providing getSubtreeSize(),
            getDefaultInternalNodePrefix(), getDoSelfAlignment(),
            getOutgroupStrategy(), getOutgroupThreshold(),
            getOutgroupAncestorQualityFraction() and getMaxNumOutgroups().
        options: object whose `path`, `root` and `outgroupNames` attributes
            are read here.

    Returns:
        The populated MultiCactusProject. Its `outgroup` attribute is None
        when the strategy is 'none'.

    Raises:
        RuntimeError: if `options.root` is set but looking it up in the
            tree fails, or if the configured outgroup strategy is not one of
            'greedy', 'greedyLeaves', 'greedyPreference', 'dynamic', 'none'.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()

    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    mcProj.inputSequences = experiment.getSequences()[:]
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()

    # One experiment file per subtree root: <options.path>/<name>/<name>_experiment.xml
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)

    # Default to the tree's own root unless the caller named a different one.
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit are not misreported as a
            # missing root name.
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    # Read the strategy once instead of re-querying the config per branch.
    strategy = config.getOutgroupStrategy()
    mcProj.outgroup = None
    if strategy == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set(mcProj.mcTree.getName(x)
                        for x in mcProj.mcTree.getLeaves())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=ogSet,
            candidateChildFrac=2.0,
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        # First pass: restricted to the user-supplied candidates.
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        # Second pass: no candidate restriction.
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" % strategy)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
def createMCProject(tree, experiment, config, options):
    """
    Assemble a MultiCactusProject from a tree, an experiment and a config.

    The steps are: wrap `tree` in a MultiCactusTree, copy sequence
    information from `experiment`, record one experiment XML path per
    subtree root, resolve the alignment root, pick outgroups according to
    config.getOutgroupStrategy(), and finally call specifyAlignmentRoot.

    Args:
        tree: tree object handed to the MultiCactusTree constructor.
        experiment: object providing getSequences() and
            getOutputSequenceDir().
        config: object providing the subtree size, internal-node prefix,
            self-alignment flag and the outgroup settings read below.
        options: object whose `path`, `root` and `outgroupNames` attributes
            are read here.

    Returns:
        The populated MultiCactusProject; its `outgroup` attribute stays
        None when the strategy is 'none'.

    Raises:
        RuntimeError: if `options.root` is set but cannot be looked up in
            the tree, or if the outgroup strategy is not recognised.
    """
    multiTree = MultiCactusTree(tree, config.getSubtreeSize())
    multiTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    multiTree.computeSubtreeRoots()

    project = MultiCactusProject()
    project.mcTree = multiTree
    project.inputSequences = experiment.getSequences()[:]
    project.outputSequenceDir = experiment.getOutputSequenceDir()
    if config.getDoSelfAlignment():
        multiTree.addSelfEdges()

    # Each subtree root gets <options.path>/<root>/<root>_experiment.xml.
    for rootName in project.mcTree.getSubtreeRootNames():
        relativePath = "%s/%s/%s_experiment.xml" % (options.path, rootName, rootName)
        project.expMap[rootName] = os.path.abspath(relativePath)

    # Start from the tree's root; a user-supplied root name overrides it.
    rootId = project.mcTree.getRootId()
    requestedRoot = options.root
    if requestedRoot is not None:
        try:
            rootId = project.mcTree.getNodeId(requestedRoot)
        except Exception:
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    strategy = config.getOutgroupStrategy()
    project.outgroup = None

    if strategy == 'greedy':
        # Candidates come from the caller; None is passed through as-is
        # when no candidates were given.
        chooser = GreedyOutgroup()
        project.outgroup = chooser
        chooser.importTree(project.mcTree, rootId)
        chooser.greedy(threshold=config.getOutgroupThreshold(),
                       candidateSet=options.outgroupNames,
                       candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                       maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyLeaves':
        # Candidates default to every leaf name when the caller gave none.
        chooser = GreedyOutgroup()
        project.outgroup = chooser
        chooser.importTree(project.mcTree, rootId)
        candidates = options.outgroupNames
        if candidates is None:
            leafNames = [project.mcTree.getName(leaf)
                         for leaf in project.mcTree.getLeaves()]
            candidates = set(leafNames)
        chooser.greedy(threshold=config.getOutgroupThreshold(),
                       candidateSet=candidates,
                       candidateChildFrac=2.0,
                       maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyPreference':
        # Two passes: first restricted to the caller's candidates, then
        # unrestricted so other nodes can act as "filler".
        chooser = GreedyOutgroup()
        project.outgroup = chooser
        chooser.importTree(project.mcTree, rootId)
        for candidates in (options.outgroupNames, None):
            chooser.greedy(threshold=config.getOutgroupThreshold(),
                           candidateSet=candidates,
                           candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                           maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # Dynamic-programming selection that optimises the probability of a
        # base in the target node aligning to at least one outgroup base.
        # It only returns leaves and its model is naive, but unlike the
        # greedy strategies it accounts for phylogenetic redundancy and
        # tries to factor in assembly size/quality.
        chooser = DynamicOutgroup()
        project.outgroup = chooser
        chooser.importTree(project.mcTree, project.getInputSequenceMap(),
                           rootId, candidateSet=options.outgroupNames)
        chooser.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" % strategy)

    # Reroot at the chosen alignment root if needed. Leaf genomes that drop
    # out of the tree but are still used as outgroups are moved into special
    # fields so their paths can still be found (e.g. for preprocessing).
    specifyAlignmentRoot(project, rootId)
    return project