def cleanEventTree(experiment):
    """Sanitise the event names of an experiment's species tree in place.

    A MultiCactusTree is built from ``experiment.getTree()`` and
    ``nameUnlabeledInternalNodes()`` is called on it.  Then:

    1. every named node whose name contains '.' has the dots replaced by '_',
       and every named node that has a parent must have a weight on the edge
       from that parent;
    2. nodes are renamed (by appending an increasing integer suffix) until a
       full pass over all node pairs finds no two distinct nodes sharing a name.

    Each rename is reported on stderr.  Finally the tree is serialised back
    into ``experiment.xmlRoot.attrib["species_tree"]`` and ``experiment.seqMap``
    is rebuilt with ``experiment.buildSequenceMap()``.

    Raises:
        RuntimeError: if a named node's edge to its parent has no weight.
    """
    tree = MultiCactusTree(experiment.getTree())
    tree.nameUnlabeledInternalNodes()

    # Pass 1: replace dots in names and validate branch lengths.
    for node in tree.breadthFirstTraversal():
        if not tree.hasName(node):
            continue
        oldName = tree.getName(node)
        if '.' in oldName:
            cleaned = oldName.replace('.', '_')
            sys.stderr.write('WARNING renaming event %s to %s\n' %(oldName, cleaned))
            tree.setName(node, cleaned)
        parent = tree.getParent(node)
        if parent is None:
            continue
        if tree.getWeight(parent, node) is None:
            raise RuntimeError('Missing branch length in species_tree tree')

    # Pass 2: keep sweeping over all ordered node pairs until a whole sweep
    # performs no rename.  The suffix counter is shared across sweeps.
    suffix = 0
    while True:
        renamedAny = False
        for first in tree.breadthFirstTraversal():
            # Read once per outer node: it is deliberately not refreshed after
            # renames made by the inner loop.
            firstName = tree.getName(first)
            for second in tree.breadthFirstTraversal():
                secondName = tree.getName(second)
                clash = first != second and firstName == secondName
                if not clash:
                    continue
                uniqueName = "%s%i" % (secondName, suffix)
                suffix += 1
                tree.setName(second, uniqueName)
                sys.stderr.write('WARNING renaming event %s to %s\n' % (
                    secondName, uniqueName))
                renamedAny = True
        if not renamedAny:
            break

    experiment.xmlRoot.attrib["species_tree"] = NXNewick().writeString(tree)
    experiment.seqMap = experiment.buildSequenceMap()
Example #2
0
def cleanEventTree(experiment):
    """Sanitise the event names of an experiment's species tree in place.

    A MultiCactusTree is built from ``experiment.getTree()`` and
    ``nameUnlabeledInternalNodes()`` is called on it.  Then:

    1. every named node whose name contains '.' has the dots replaced by '_',
       and every named node that has a parent must have a weight on the edge
       from that parent;
    2. nodes are renamed (by appending an increasing integer suffix) until a
       full pass over all node pairs finds no two distinct nodes sharing a name.

    Each rename is reported on stderr.  Finally the tree is serialised back
    into ``experiment.xmlRoot.attrib["species_tree"]`` and ``experiment.seqMap``
    is rebuilt with ``experiment.buildSequenceMap()``.

    Raises:
        RuntimeError: if a named node's edge to its parent has no weight.
    """
    tree = MultiCactusTree(experiment.getTree())
    tree.nameUnlabeledInternalNodes()

    # Pass 1: replace dots in names and validate branch lengths.
    for node in tree.breadthFirstTraversal():
        if not tree.hasName(node):
            continue
        oldName = tree.getName(node)
        if '.' in oldName:
            cleaned = oldName.replace('.', '_')
            sys.stderr.write('WARNING renaming event %s to %s\n' %(oldName, cleaned))
            tree.setName(node, cleaned)
        parent = tree.getParent(node)
        if parent is None:
            continue
        if tree.getWeight(parent, node) is None:
            raise RuntimeError('Missing branch length in species_tree tree')

    # Pass 2: keep sweeping over all ordered node pairs until a whole sweep
    # performs no rename.  The suffix counter is shared across sweeps.
    suffix = 0
    while True:
        renamedAny = False
        for first in tree.breadthFirstTraversal():
            # Read once per outer node: it is deliberately not refreshed after
            # renames made by the inner loop.
            firstName = tree.getName(first)
            for second in tree.breadthFirstTraversal():
                secondName = tree.getName(second)
                clash = first != second and firstName == secondName
                if not clash:
                    continue
                uniqueName = "%s%i" % (secondName, suffix)
                suffix += 1
                tree.setName(second, uniqueName)
                sys.stderr.write('WARNING renaming event %s to %s\n' % (
                    secondName, uniqueName))
                renamedAny = True
        if not renamedAny:
            break

    experiment.xmlRoot.attrib["species_tree"] = NXNewick().writeString(tree)
    experiment.seqMap = experiment.buildSequenceMap()
Example #3
0
 def testJustLeaves(self):
     """Check the first outgroup chosen per ancestor when only leaves are candidates."""
     newick = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
     parsed = NXNewick().parseString(newick, addImpliedRoots = False)
     mcTree = MultiCactusTree(parsed)
     mcTree.computeSubtreeRoots()
     og = GreedyOutgroup()
     og.importTree(mcTree)
     leafNames = {mcTree.getName(leaf) for leaf in mcTree.getLeaves()}
     og.greedy(candidateSet=leafNames, candidateChildFrac=2.)

     # (ancestor, acceptable names for its first-ranked outgroup), checked in
     # this order.  Ties between equidistant siblings may go either way.
     expectedFirstOutgroup = [
         ('Anc1', ('HUMAN',)),
         ('Anc2', ('CAT', 'DOG')),
         ('Anc3', ('PIG', 'COW')),
         ('Anc4', ('CAT', 'DOG')),
         ('Anc5', ('HUMAN',)),
         ('Anc6', ('CAT', 'DOG')),
         ('Anc7', ('BABOON',)),
     ]
     for ancestor, acceptable in expectedFirstOutgroup:
         assert og.ogMap[ancestor][0][0] in acceptable
Example #4
0
 def testMultipleOutgroupsJustLeaves(self):
     """Check multi-outgroup selection (up to 3) when only leaves are candidates.

     Each ``og.ogMap`` value is treated as a list of entries whose element 0 is
     the outgroup name and element 1 is the key the list must be sorted by.
     """
     tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
     mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots = False))
     mcTree.computeSubtreeRoots()
     og = GreedyOutgroup()
     og.importTree(mcTree)
     candidates = set([mcTree.getName(x) for x in mcTree.getLeaves()])
     og.greedy(candidateSet=candidates, candidateChildFrac=2.,
               maxNumOutgroups=3)
     # make sure all entries have <= 3 outgroups.
     assert all(len(x) <= 3 for x in og.ogMap.values())
     # and for all entries, the closest must be first.
     assert all(x == sorted(x, key=itemgetter(1))
                for x in og.ogMap.values())
     # ordering is important!
     # Build a real list: on Python 3 ``map(...)`` is an iterator and never
     # compares equal to a list, which made this assertion fail unconditionally.
     anc1Names = [entry[0] for entry in og.ogMap['Anc1']]
     assert anc1Names == ['HUMAN', 'CHIMP', 'BABOON']
     assert og.ogMap['Anc7'][0][0] == 'BABOON'
     assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
     assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']
Example #5
0
def runCactusAfterBlastOnly(options):
    """Run the Cactus alignment step on already-computed alignments and export the HAL.

    Inside a Toil context this either restarts a previous run
    (``options.restart``) or:

    * creates a temporary ``options.cactusDir``,
    * optionally rewrites the seqfile with the given path overrides,
    * builds the progressive project XML and loads the experiment for
      ``options.root``,
    * imports outgroup fragments (if present), input sequences, the config and
      the alignment files named in ``options.cigarsFile`` into the Toil file
      store,
    * starts ``run_cactus_align``.

    The resulting HAL file ID is exported to ``options.outputHal``.

    Args:
        options: parsed command-line options.  Fields read here: ``restart``,
            ``pathOverrides``, ``pathOverrideNames``, ``seqFile``,
            ``configFile``, ``root``, ``cigarsFile``, ``pangenome``,
            ``pafInput``, ``nonCactusInput`` and ``outputHal``.  ``cactusDir``
            (and, with path overrides, ``seqFile``) are overwritten.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # Parsed only for its side effect of failing early on an unreadable
            # config; the result is not used below.
            configXml = ET.parse(configPath).getroot()

            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                """Return the entry of options.cigarsFile for ``suffix``.

                An entry ending in a non-empty ``suffix`` is returned as is.
                Otherwise the base path (an entry whose basename is a prefix of
                the current base's basename) is returned with ``suffix``
                appended.
                """
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    # NOTE(review): a failure of this assertion is caught by the
                    # handler below and treated like a missing fragment.
                    assert not options.pangenome
                except Exception:
                    # Narrowed from a bare ``except`` so Ctrl-C / SystemExit
                    # still propagate.
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        # A directory of sequence files is concatenated into a
                        # single temporary file before import.
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except Exception:
                    # Secondary alignments are optional: leave the ID as None
                    # when the file cannot be imported.
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
Example #6
0
class MultiCactusProject:
    """In-memory view of a ``multi_cactus`` project XML file.

    Holds the project tree, the per-event experiment paths/IDs, the input
    sequence paths/IDs, the output sequence ID map and the config ID, and
    knows how to read and write them as XML.
    """

    def __init__(self):
        # MultiCactusTree parsed from the <tree> element (None until readXML).
        self.mcTree = None
        # event name -> path of that event's experiment XML
        self.expMap = dict()
        # event name -> file-store ID of the experiment XML
        self.expIDMap = None
        # input sequence paths, as listed in the "inputSequences" attribute
        self.inputSequences = []
        # input sequence IDs, as listed in the "inputSequenceIDs" attribute
        self.inputSequenceIDs = None
        # leaf name -> output sequence ID
        self.outputSequenceIDMap = None
        self.configID = None

    def readXML(self, path):
        """Populate this project from the ``multi_cactus`` XML file at ``path``."""
        xmlRoot = ET.parse(path).getroot()
        treeElem = xmlRoot.find("tree")
        self.mcTree = MultiCactusTree(NXNewick().parseString(
            treeElem.text, addImpliedRoots=False))
        self.expMap = dict()
        self.expIDMap = dict()
        cactusPathElemList = xmlRoot.findall("cactus")
        for cactusPathElem in cactusPathElemList:
            nameElem = cactusPathElem.attrib["name"]
            pathElem = cactusPathElem.attrib["experiment_path"]
            self.expMap[nameElem] = pathElem
            if "experiment_id" in cactusPathElem.attrib:
                self.expIDMap[nameElem] = cactusPathElem.attrib[
                    "experiment_id"]
        self.inputSequences = xmlRoot.attrib["inputSequences"].split()
        if "inputSequenceIDs" in xmlRoot.attrib:
            self.inputSequenceIDs = xmlRoot.attrib["inputSequenceIDs"].split()
        if "outputSequenceIDs" in xmlRoot.attrib:
            # Keys are names and values are IDs, matching what
            # setOutputSequenceIDs() builds and what writeXML() serialises.
            # (Zipping IDs first inverted the map on every write/read cycle.)
            self.outputSequenceIDMap = dict(
                zip(xmlRoot.attrib["outputSequenceNames"].split(),
                    xmlRoot.attrib["outputSequenceIDs"].split()))

        logger.info("xmlRoot = %s" % ET.tostring(xmlRoot))
        if "configID" in xmlRoot.attrib:
            self.configID = xmlRoot.attrib["configID"]

        self.mcTree.assignSubtreeRootNames(self.expMap)

    def writeXML(self, path):
        """Serialise this project as pretty-printed ``multi_cactus`` XML to ``path``."""
        xmlRoot = ET.Element("multi_cactus")
        treeElem = ET.Element("tree")
        treeElem.text = NXNewick().writeString(self.mcTree)
        xmlRoot.append(treeElem)
        for name, expPath in self.expMap.items():
            cactusPathElem = ET.Element("cactus")
            cactusPathElem.attrib["name"] = name
            cactusPathElem.attrib["experiment_path"] = expPath
            if self.expIDMap:
                cactusPathElem.attrib["experiment_id"] = self.expIDMap[name]
            xmlRoot.append(cactusPathElem)
        #We keep track of all the input sequences at the top level
        xmlRoot.attrib["inputSequences"] = " ".join(self.inputSequences)
        if self.inputSequenceIDs:
            xmlRoot.attrib["inputSequenceIDs"] = " ".join(
                self.inputSequenceIDs)
        if self.outputSequenceIDMap:
            xmlRoot.attrib["outputSequenceIDs"] = " ".join(
                self.outputSequenceIDMap.values())
            xmlRoot.attrib["outputSequenceNames"] = " ".join(
                self.outputSequenceIDMap.keys())
        if self.configID:
            xmlRoot.attrib["configID"] = self.configID

        # Serialise before opening so a failure cannot leave a truncated file,
        # and use a context manager so the handle is always closed.
        xmlString = ET.tostring(xmlRoot)
        xmlString = minidom.parseString(xmlString).toprettyxml()
        with open(path, "w") as xmlFile:
            xmlFile.write(xmlString)

    def syncToFileStore(self, toil):
        """Import every experiment (plus its config and constraints) into ``toil``.

        Each experiment XML is rewritten on disk with the new IDs before being
        imported itself; ``expIDMap`` is rebuilt from the resulting IDs.
        """
        self.expIDMap = dict()
        for name, expPath in self.expMap.items():
            expWrapper = ExperimentWrapper(ET.parse(expPath).getroot())
            expWrapper.setConfigID(
                toil.importFile("file://" + expWrapper.getConfig()))
            if expWrapper.getConstraintsFilePath():
                expWrapper.setConstraintsID(
                    toil.importFile("file://" +
                                    expWrapper.getConstraintsFilePath()))
            expWrapper.writeXML(expPath)
            self.expIDMap[name] = toil.importFile("file://" + expPath)

    def getInputSequenceIDMap(self):
        """Return a map between event names and sequence IDs.

        IDs are matched to leaves in post-order traversal order.
        """
        inputSequenceMap = dict()
        i = 0
        for node in self.mcTree.postOrderTraversal():
            if self.mcTree.isLeaf(node) is True:
                inputSequenceMap[self.mcTree.getName(node)] = \
                  self.inputSequenceIDs[i]
                i += 1
        assert i == len(self.inputSequenceIDs)
        return inputSequenceMap

    def getInputSequenceIDs(self):
        """Get the set of input sequences for the multicactus tree
        """
        return self.inputSequenceIDs

    def getInputSequencePaths(self):
        """Return the list of input sequence paths."""
        return self.inputSequences

    def setOutputSequenceIDs(self, outputSequenceIDs):
        """Build the leaf-name -> ID map from IDs given in post-order leaf order."""
        self.outputSequenceIDMap = dict()
        i = 0
        for node in self.mcTree.postOrderTraversal():
            if self.mcTree.isLeaf(node) is True:
                self.outputSequenceIDMap[self.mcTree.getName(node)] = \
                  outputSequenceIDs[i]
                i += 1
        assert i == len(outputSequenceIDs)

    def getOutputSequenceIDMap(self):
        """Return the leaf-name -> output sequence ID map (or None if unset)."""
        return self.outputSequenceIDMap

    def getConfigPath(self):
        """Return the config path recorded in the first experiment of ``expMap``.

        Raises:
            IndexError: if ``expMap`` is empty.
        """
        # dict.values() is not indexable on Python 3; materialise it first.
        firstExperimentPath = list(self.expMap.values())[0]
        return ExperimentWrapper(ET.parse(
            firstExperimentPath).getroot()).getConfigPath()

    def setConfigID(self, configID):
        """Record the ID of the config file."""
        self.configID = configID

    def getConfigID(self):
        """Return the recorded config file ID (or None if unset)."""
        return self.configID

    def setInputSequenceIDs(self, inputSequenceIDs):
        """Record the list of input sequence IDs."""
        self.inputSequenceIDs = inputSequenceIDs
Example #7
0
class TestCase(unittest.TestCase):
    """Tests for GreedyOutgroup and DynamicOutgroup on random and fixed trees."""

    def setUp(self):
        """Build the tree fixtures and the dummy FASTA files used by the tests."""
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        # One sequence map per random tree, index-aligned with the first
        # entries of self.mcTrees (the fixed trees are appended afterwards).
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree)
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(
            borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(
            backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        # event name -> path of a FASTA file holding seqLen 'A' characters
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        """Remove the temporary directory created by setUp."""
        # Local import: shutil.rmtree replaces the unquoted ``rm -rf`` shell
        # string, which broke on paths containing spaces or shell metacharacters.
        import shutil
        unittest.TestCase.tearDown(self)
        shutil.rmtree(self.tempDir, ignore_errors=True)

    def _assertOutgroupListsValid(self, ogMap, maxNumOutgroups=3):
        """Assert each ogMap entry is short enough and sorted by element 1."""
        # make sure all entries have <= maxNumOutgroups outgroups.
        assert all(len(x) <= maxNumOutgroups for x in ogMap.values())
        # and for all entries, the closest must be first.
        assert all(x == sorted(x, key=itemgetter(1)) for x in ogMap.values())

    def testJustLeaves(self):
        """Check the first outgroup per ancestor when only leaves are candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
        self.assertEqual(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),
                  threshold=0,
                  maxNumOutgroups=3,
                  candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)] \
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        """Check outgroup choices for a restricted candidate set at two candidateChildFrac values."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        """Unrestricted candidates must give a first outgroup no farther than leaves-only."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        """Same comparison as testGeneralBetterThanLeaves, with threshold=2 on the unrestricted run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        """Check the ordered outgroup lists chosen with maxNumOutgroups=3."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        self._assertOutgroupListsValid(og.ogMap)
        # ordering is important!
        # Lists are built explicitly: on Python 3 ``map(...)`` is an iterator
        # and never compares equal to a list.
        assert [entry[0] for entry in og.ogMap['Anc4']] == ['Anc1']
        assert [entry[0] for entry in og.ogMap['Anc7']
                ] == ['BABOON', 'Anc1', 'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert [entry[0] for entry in og.ogMap['Anc1']
                ] == ['HUMAN', 'CHIMP', 'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        """Check multi-outgroup selection when only leaves are candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates,
                  candidateChildFrac=2.,
                  maxNumOutgroups=3)
        self._assertOutgroupListsValid(og.ogMap)
        # ordering is important!
        assert [entry[0] for entry in og.ogMap['Anc1']
                ] == ['HUMAN', 'CHIMP', 'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        """Outgroup lists stay bounded and sorted on every fixture tree."""
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            self._assertOutgroupListsValid(og.ogMap)

    def testDynamicOutgroupsOnRandomTrees(self):
        """DynamicOutgroup lists stay bounded and sorted on the random trees of degree < 8."""
        # zip stops at the shorter list, so only trees with a dummy sequence
        # map (the random ones) are exercised here.
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([
                len(tree.getChildren(x)) for x in tree.breadthFirstTraversal()
            ])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # the sorted check will hold because all sequences are the same
                self._assertOutgroupListsValid(og.ogMap)

    def testDynamicOutgroupsJustLeaves(self):
        """Check DynamicOutgroup on the Boreoeutherian tree, with and without sequenceLossWeight=0."""
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        self._assertOutgroupListsValid(og.ogMap)
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # we keep dynamic outgroups sorted by distance too
        self._assertOutgroupListsValid(og.ogMap)

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.
        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            self._assertOutgroupListsValid(ogMultipleTimes.ogMap)
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different candidate
        sets will behave properly, i.e. keep all the existing outgroup
        assignments and fill in more on the second run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([
                tree.getName(i)
                for i in random.sample(nodes, min(20, len(nodes)))
            ])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            self._assertOutgroupListsValid(ogTwice.ogMap)
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(
                                sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(
                                sink1Id)
Example #8
0
def make_align_job(options, toil):
    """Build the Toil job that runs ``run_cactus_align`` for ``options.root``.

    The function prepares a progressive-cactus project in a fresh temporary
    directory, imports every input the alignment needs into the Toil file
    store (outgroup fragments, genome sequences, the config, the alignments
    and, when present, secondary alignments and ingroup coverage files) and
    applies the command-line overrides to the config.

    Args:
        options: parsed command-line options.  This object is mutated:
            ``cactusDir`` is always set, and ``seqFile`` / ``root`` are set
            when path overrides are given or no root was specified.
        toil: the Toil context used to import files.

    Returns:
        The job wrapping ``run_cactus_align``, ready to be started.

    Raises:
        RuntimeError: if ``options.acyclic`` names a genome that is not a
            leaf of the seqfile tree.
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        # Persist the overridden seqfile and use it from here on.
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # Default the root event to the root of the (fully named) seqfile tree.
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    # The --acyclic genome must be one of the leaves of the tree.
    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    # The parsed result is not used below; the parse is kept so that an
    # unreadable experiment config still fails at this point.
    configXml = ET.parse(configPath).getroot()

    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        """Return the input path for ``suffix`` from ``options.cigarsFile``.

        An explicitly listed path ending in ``suffix`` wins.  Otherwise the
        suffix is appended to a base path, which starts as the first entry
        and is replaced by any entry whose basename is a prefix of the
        current base path's basename.
        """
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            # NOTE(review): a failure of this assert is caught just below,
            # after outgroup_fragment_found has already been set to True.
            assert not options.pangenome
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            # (Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not swallowed.)
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                # A directory of sequence files is concatenated into one file.
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    # Without fragments, the outgroup IDs are those of the full sequences
    # imported in the loop above.
    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            # For an s3 destination, write locally first and upload after.
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            # Secondary alignments are optional: a failed import simply
            # leaves secondaryAlignmentsID as None.
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        # One coverage file is imported per leaf, looked up by leaf index.
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
Example #9
0
class TestCase(unittest.TestCase):
    """Tests for outgroup selection (GreedyOutgroup and DynamicOutgroup).

    The fixtures are a set of random trees plus two fixed trees (a
    Boreoeutherian tree and a Eutherian backbone tree), all wrapped as
    MultiCactusTree objects and collected in ``self.mcTrees``.
    """

    def setUp(self):
        """Create the tree fixtures and the dummy FASTA files they refer to."""
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        # Only random trees with fewer than 50 nodes are used; every node is
        # named after its id and mapped to the same dummy sequence file.
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree, tree.degree())
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        # One all-'A' FASTA file per Boreoeutherian leaf, with these lengths.
        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event +".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        """Remove the temporary directory created by setUp."""
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        """With only leaves as candidates, check the first outgroup of each ancestor."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)] \
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        """Check first outgroups for a fixed candidate set at two candidateChildFrac values."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        """An unrestricted run must never pick a farther first outgroup than a leaves-only run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        """Same comparison as above, with the unrestricted run using threshold=2."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        """Check count, ordering and identity of outgroups when up to 3 are allowed."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        # (map() is wrapped in list(): on Python 3 a bare map object never
        # compares equal to a list.)
        assert list(map(itemgetter(0), og.ogMap['Anc4'])) == ['Anc1']
        assert list(map(itemgetter(0), og.ogMap['Anc7'])) == ['BABOON', 'Anc1',
                                                              'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP',
                                                              'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        """Check multiple-outgroup assignment when only leaves are candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.,
                  maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP',
                                                              'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        """Check the count and ordering invariants on every fixture tree."""
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           og.ogMap.values()))

    def testDynamicOutgroupsOnRandomTrees(self):
        """Check the count and ordering invariants for DynamicOutgroup on random trees."""
        # zip() stops at the shorter list, so only trees that have a dummy
        # sequence map (the random ones) are visited.
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([len(tree.getChildren(x)) for x in
                         tree.breadthFirstTraversal()])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                               og.ogMap.values()))

    def testDynamicOutgroupsJustLeaves(self):
        """Check DynamicOutgroup on the Boreoeutherian tree with real-length sequences."""
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))

        # we keep dynamic outgroups sorted by distance too
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                               og.ogMap.values()))


    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.
        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogMultipleTimes.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           ogMultipleTimes.ogMap.values()))
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different candidate
        sets will behave properly, i.e. keep all the existing outgroup
        assignments and fill in more on the second run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            # Up to 20 randomly sampled nodes form the preferred candidates.
            candidateSet = set([tree.getName(i) for i in random.sample(nodes, min(20, len(nodes)))])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogTwice.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           ogTwice.ogMap.values()))
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(sink1Id)
Example #10
0
def runCactusAfterBlastOnly(options):
    """Run the alignment phase on existing cactus-blast output and export the HAL.

    On a fresh run the function builds the progressive-cactus project,
    imports the blast output, sequences and config into the Toil file store,
    and starts ``run_cactus_align``.  On a restart it resumes the stored
    workflow.  Either way the resulting file is exported to
    ``options.outputHal``.

    Args:
        options: parsed command-line options.  ``options.cactusDir`` is set
            here on a fresh run.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # Bind the restart result to halID: that is the name exported
            # below, and it was previously left undefined on this path.
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # The parsed result is not used below; the parse is kept so that
            # an unreadable experiment config still fails at this point.
            configXml = ET.parse(configPath).getroot()

            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(options.blastOutput) +
                        '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except Exception:
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt and SystemExit are not swallowed.
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        # A directory of sequence files is concatenated
                        # into one file.
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            # For non-blast input, the outgroup IDs are those of the full
            # sequences imported in the loop above.
            if not cactus_blast_input:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except Exception:
                # Secondary alignments are optional.
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                # One coverage file is imported per leaf, by leaf index.
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(options.blastOutput) +
                            '.ig_coverage_{}'.format(i)))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs,
                              project, cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
Example #11
0
class MultiCactusProject:
    def __init__(self):
        self.mcTree = None
        self.expMap = dict()
        self.inputSequences = []
        self.outputSequenceDir = None

    def readXML(self, path):
        """Load the project description stored in the XML file at path.

        Reads the newick text of the <tree> element into self.mcTree, the
        name / experiment_path attributes of every <cactus> element into
        self.expMap, and the inputSequences / outputSequenceDir attributes
        of the root element.  Finally the tree is told about the
        experiment names via assignSubtreeRootNames().
        """
        root = ET.parse(path).getroot()
        treeNode = root.find("tree")
        parsedTree = NXNewick().parseString(treeNode.text,
                                            addImpliedRoots=False)
        self.mcTree = MultiCactusTree(parsedTree)

        # Start from a fresh map so entries from an earlier read are dropped.
        self.expMap = {}
        for cactusNode in root.findall("cactus"):
            attrs = cactusNode.attrib
            experimentName = attrs["name"]
            self.expMap[experimentName] = attrs["experiment_path"]

        # inputSequences is stored as one whitespace-separated attribute.
        rootAttrs = root.attrib
        self.inputSequences = rootAttrs["inputSequences"].split()
        self.outputSequenceDir = rootAttrs["outputSequenceDir"]
        self.mcTree.assignSubtreeRootNames(self.expMap)

    def writeXML(self, path):
        """Serialise this project as pretty-printed XML and write it to path.

        The document has a <multi_cactus> root carrying the inputSequences
        and outputSequenceDir attributes, one <tree> child holding the
        newick string of self.mcTree, and one <cactus> child (name,
        experiment_path) per entry of self.expMap.

        The XML text is built completely before the output file is opened,
        so a serialisation error neither truncates an existing file nor
        leaks an open file handle.
        """
        xmlRoot = ET.Element("multi_cactus")
        treeElem = ET.Element("tree")
        treeElem.text = NXNewick().writeString(self.mcTree)
        xmlRoot.append(treeElem)
        for name, expPath in self.expMap.items():
            cactusPathElem = ET.Element("cactus")
            cactusPathElem.attrib["name"] = name
            cactusPathElem.attrib["experiment_path"] = expPath
            xmlRoot.append(cactusPathElem)
        # We keep track of all the input sequences at the top level
        xmlRoot.attrib["inputSequences"] = " ".join(self.inputSequences)
        xmlRoot.attrib["outputSequenceDir"] = self.outputSequenceDir

        # Round-trip through minidom purely to get indented output.
        xmlString = minidom.parseString(ET.tostring(xmlRoot)).toprettyxml()

        # The context manager closes the file even if the write fails.
        with open(path, "w") as xmlFile:
            xmlFile.write(xmlString)

    # find the sequence associated with an event name
    # by digging out the appropriate experiment file
    # doesn't work for the root!!!!
    def sequencePath(self, eventName):
        """Return the sequence file recorded for eventName.

        The lookup goes through the experiment XML registered in
        self.expMap under the subtree root that self.mcTree reports for
        eventName.  Asserts that the resulting path is an existing file.
        """
        subtreeRoot = self.mcTree.getSubtreeRoot(eventName)
        experimentFile = self.expMap[subtreeRoot]
        experimentRoot = ET.parse(experimentFile).getroot()
        seqPath = ExperimentWrapper(experimentRoot).getSequence(eventName)
        assert os.path.isfile(seqPath)
        return seqPath

    def getInputSequenceMap(self):
        """Return a map between event names and sequence paths.  Paths
        are different from above in that they are not taken from experiment
        xmls, but rather from directly from the project xml.
        """
        inputSequenceMap = dict()
        i = 0
        for node in self.mcTree.postOrderTraversal():
            if self.mcTree.isLeaf(node) is True:
                inputSequenceMap[self.mcTree.getName(node)] = self.inputSequences[i]
                i += 1
        assert i == len(self.inputSequences)
        return inputSequenceMap

    def getInputSequencePaths(self):
        """Get the set of input sequences for the multicactus tree
        """
        return self.inputSequences

    def getOutputSequenceDir(self):
        """The directory where the output sequences go
        """
        return self.outputSequenceDir

    def getConfigPath(self):
        return ExperimentWrapper(ET.parse(self.expMap.values()[0]).getroot()).getConfigPath()