def progressiveFunction(self, experimentFile, toilDir,
                            batchSystem, buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        """Run a progressive alignment and check the headers it emits.

        A multi-cactus project is created from ``experimentFile`` in a
        fresh temporary directory under the current working directory,
        ``runCactusProgressive`` is run on it, and every sequence
        ('s') line of each sub-experiment's c2h output is checked
        against the FASTA headers of the corresponding input genome.

        Arguments:
        experimentFile -- path to the experiment XML; used to create the
            project and to look up the input sequence paths.
        toilDir -- passed to runCactusProgressive and to
            runToilStatusAndFailIfNotComplete.
        batchSystem, buildAvgs, toilStats -- forwarded unchanged to
            runCactusProgressive.
        buildReference, buildHal, buildFasta -- accepted but not used
            anywhere in this function.
        subtreeRoot -- forwarded as ``root`` to
            runCreateMultiCactusProject.

        Fails (via self.assertTrue) if an output header of a
        non-outgroup input genome is missing from that genome's input
        FASTA. The temporary directory is only removed when every
        check passes.
        """
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers in the output also appear in the
        # input FASTAs (only headers are compared here, not the
        # sequences themselves).
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> set of headers in its input fasta. A set
        # keeps the per-line membership test below O(1) instead of a
        # linear scan of every header for every c2h sequence line.
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                # NOTE(review): this temp file is not explicitly removed
                # by this function -- confirm whether it needs cleanup.
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = set(map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line: drop the first and last
                        # character of each field (presumably the
                        # surrounding quotes -- confirm against the
                        # c2h format).
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(header in headers[genome],
                                            'Header %s from output c2h %s not found in input fa %s'
                                            ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
Example #2
0
    def loadProject(self, mcProject):
        """Build ``self.inGraph``, the dependency graph of the project.

        One node is added for every entry of ``mcProject.expMap``. For
        each experiment, an edge is added from the experiment's name to
        every event of its tree that passes the filters below and is
        not a leaf of the global tree (``mcProject.mcTree``).

        ``self.maxParallelSubtrees`` is taken from the experiments'
        configs; all configs are asserted to agree on the value. The
        resulting graph is asserted to be a directed acyclic graph.
        """
        self.inGraph = NX.DiGraph()
        globalTree = mcProject.mcTree
        self.maxParallelSubtrees = None
        globalLeafNames = [globalTree.getName(leaf) for leaf in globalTree.getLeaves()]

        for name, expPath in mcProject.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            tree = exp.getTree()
            self.inGraph.add_node(name)

            # Walk the experiment's species tree and keep only real
            # dependencies (outgroups and ingroups), not the nodes that
            # exist merely to form the correct paths.
            for node in tree.postOrderTraversal():
                nodeName = tree.getName(node)

                if not (tree.isLeaf(node) or nodeName in exp.getOutgroupEvents()):
                    # Internal, non-outgroup node: it only exists
                    # because the tree was induced from the species
                    # tree, and none of its sequence is used.
                    continue

                assert tree.hasParent(node)

                if not (nodeName in exp.getOutgroupEvents()
                        or tree.getName(tree.getParent(node)) == name):
                    # Neither an outgroup nor a direct child of this
                    # experiment's event: a leaf added only to make the
                    # species tree binary.
                    continue

                if nodeName in globalLeafNames:
                    # Leaves of the global tree are input sequences, so
                    # they never form dependencies. (Existing references
                    # could be treated the same way when --overwrite is
                    # not given, but for now only leaves are skipped.)
                    continue

                self.inGraph.add_edge(name, nodeName)

            # Pick up max parallel subtrees from this experiment's
            # config; every experiment must report the same value.
            conf = ConfigWrapper(ET.parse(exp.getConfig()).getroot())
            if self.maxParallelSubtrees is None:
                self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
            else:
                assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()

        assert NX.is_directed_acyclic_graph(self.inGraph)
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=False):
    """Assemble the project's sub-alignments into one HAL file and store it.

    The project tree is walked breadth-first (from ``event`` when
    given, which must name a non-leaf node). For every node that has an
    experiment in ``project.expMap``, the experiment's HAL and HAL-fasta
    files are fetched from the job's file store and appended with
    ``halAppendCactusSubtree``. The optional cache/chunk/deflate/inMemory
    arguments are forwarded as the matching command-line flags when set.

    Afterwards the CACTUS_COMMIT and CACTUS_CONFIG metadata entries are
    written with ``halSetMetadata`` (the config is base64-encoded), and
    the ID returned by ``job.fileStore.writeGlobalFile`` is returned.
    """
    HALPath = "tmp_alignment.hal"

    # Walk the project tree breadth-first, optionally from a subtree root.
    tree = project.mcTree
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Flags that take a value, in the order they go on the command line.
    valuedOptions = (
        ("--cacheBytes", cacheBytes),
        ("--cacheMDC", cacheMDC),
        ("--cacheRDC", cacheRDC),
        ("--cacheW0", cacheW0),
        ("--chunk", chunk),
        ("--deflate", deflate),
    )

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        localExpPath = job.fileStore.readGlobalFile(project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(localExpPath).getroot())

        outgroups = experiment.getOutgroupEvents()
        experiment.setConfigPath(job.fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

        # Only the file names (not the full paths) are handed to the tool.
        args = [
            os.path.basename(subHALPath),
            os.path.basename(halFastaPath),
            expTreeString,
            os.path.basename(HALPath),
        ]

        if len(outgroups) > 0:
            args.extend(["--outgroups", ",".join(outgroups)])
        for flag, value in valuedOptions:
            if value is not None:
                args.extend([flag, value])
        if inMemory is True:
            args.append("--inMemory")

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    # Record the producing commit and the encoded config in the HAL.
    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        encodedConfig = b64encode(configFile.read())
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG", encodedConfig])

    return job.fileStore.writeGlobalFile(HALPath)
Example #4
0
 def testOutgroups(self):
     """Outgroups added to an experiment show up in its events and sequence map."""
     exp = ExperimentWrapper(self.__makeXmlDummy(self.tree, self.sequences))
     # The wrapper must report the tree it was built from.
     assert NXNewick().writeString(exp.getTree()) == self.tree

     # (event name, distance, sequence path) for each outgroup to add.
     addedOutgroups = (
         ("outgroup", 1.3, "outgroup.fa"),
         ("outgroup2", 2.6, "outgroup2.fa"),
     )
     for eventName, distance, fastaPath in addedOutgroups:
         exp.addOutgroupSequence(eventName, distance, fastaPath)

     assert exp.getOutgroupEvents() == ["outgroup", "outgroup2"]

     seqMap = exp.buildSequenceMap()
     for eventName, _, fastaPath in addedOutgroups:
         assert eventName in seqMap
         assert seqMap[eventName] == fastaPath
Example #5
0
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=False):
    """Append every sub-alignment of the project to one HAL file and store it.

    Nodes of ``project.mcTree`` are visited breadth-first, starting at
    ``event`` when one is given (it must name a non-leaf node). Each
    node with an entry in ``project.expMap`` has its experiment, config,
    HAL and HAL-fasta files read from the job's file store and is
    appended through ``halAppendCactusSubtree``. Optional tuning
    arguments that are set are passed through as command-line flags.

    Returns whatever ``job.fileStore.writeGlobalFile`` returns for the
    assembled HAL path.
    """
    HALPath = "tmp_alignment.hal"

    # The tuning flags do not depend on the node, so build them once.
    tuningArgs = []
    for flag, value in (("--cacheBytes", cacheBytes),
                        ("--cacheMDC", cacheMDC),
                        ("--cacheRDC", cacheRDC),
                        ("--cacheW0", cacheW0),
                        ("--chunk", chunk),
                        ("--deflate", deflate)):
        if value is not None:
            tuningArgs += [flag, value]
    if inMemory is True:
        tuningArgs += ["--inMemory"]

    # Traverse breadth-first, from the requested subtree root if any.
    tree = project.mcTree
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in project.expMap:
            expXmlPath = job.fileStore.readGlobalFile(project.expIDMap[genomeName])
            experiment = ExperimentWrapper(ET.parse(expXmlPath).getroot())

            outgroups = experiment.getOutgroupEvents()
            configPath = job.fileStore.readGlobalFile(experiment.getConfigID())
            experiment.setConfigPath(configPath)
            expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
            assert len(expTreeString) > 1
            assert experiment.getHalID() is not None
            assert experiment.getHalFastaID() is not None
            subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
            halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

            # Positional arguments use file names only, then the
            # per-experiment outgroups, then the shared tuning flags.
            args = [os.path.basename(p) for p in (subHALPath, halFastaPath)]
            args += [expTreeString, os.path.basename(HALPath)]
            if len(outgroups) > 0:
                args += ["--outgroups", ",".join(outgroups)]
            args += tuningArgs

            cactus_call(parameters=["halAppendCactusSubtree"] + args)

    return job.fileStore.writeGlobalFile(HALPath)
Example #6
0
def main():
    """Append every experiment of a multi-cactus project to one HAL file.

    Options come from ``initParser()``. Unless ``append`` is set, the
    target HAL file is removed first. The project tree is then walked
    breadth-first (from ``event`` when given, which must name a
    non-leaf node) and ``halAppendCactusSubtree`` is run through
    ``system`` for every node that has an experiment in the project.
    Each command is echoed before it runs, and the total wall-clock
    time and the time spent in the append commands are printed at the
    end.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal. The path is quoted exactly as it is
        # for halAppendCactusSubtree below, so a path containing
        # whitespace removes that file rather than several others.
        removeCmd = "rm -f '{0}'".format(args['HAL_file_path'])
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Options forwarded verbatim as "--<name> <value>" when they are set.
    valuedOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                     "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            experiment = ExperimentWrapper(
                ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree())
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            cmdline = "time halAppendCactusSubtree '{0}' '{1}' '{2}' '{3}'".format(
                experiment.getHALPath(), experiment.getHALFastaPath(),
                expTreeString, args['HAL_file_path'])

            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            for option in valuedOptions:
                if args[option] is not None:
                    cmdline += " --{0} {1}".format(option, args[option])
            if args["inMemory"] is True:
                cmdline += " --inMemory"

            # Single-argument print(...) behaves the same under the
            # Python 2 print statement and the Python 3 function.
            print(cmdline)
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime

    totalTime = time.time() - totalTime
    print("total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
Example #7
0
    def progressiveFunction(self,
                            experimentFile,
                            toilDir,
                            batchSystem,
                            buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        """Run a progressive alignment and validate the output headers.

        Creates a multi-cactus project from ``experimentFile`` inside a
        new temporary directory under the current working directory,
        runs ``runCactusProgressive`` on it, then checks each sequence
        ('s') line of every sub-experiment's c2h output against the
        FASTA headers of the matching input genome.

        Arguments:
        experimentFile -- path to the experiment XML; used both to
            create the project and to find the input sequence paths.
        toilDir -- passed to runCactusProgressive and to
            runToilStatusAndFailIfNotComplete.
        batchSystem, buildAvgs, toilStats -- forwarded unchanged to
            runCactusProgressive.
        buildReference, buildHal, buildFasta -- accepted but not used
            anywhere in this function.
        subtreeRoot -- forwarded as ``root`` to
            runCreateMultiCactusProject.

        Fails (via self.assertTrue) when an output header of a
        non-outgroup input genome does not occur in that genome's input
        FASTA. The temporary directory is removed only if all checks
        pass.
        """
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir,
                                          "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers in the output also appear in the
        # input FASTAs (only the headers are compared, not the
        # sequences themselves).
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> set of headers in its input fasta. The
        # set makes each membership test in the loop below O(1) rather
        # than a scan over all of the genome's headers.
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                # NOTE(review): this temp file is not explicitly removed
                # by this function -- confirm whether it needs cleanup.
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = set(
                map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' %
                                 (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line: strip the first and last
                        # character of each field (presumably the
                        # surrounding quotes -- confirm against the
                        # c2h format).
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(
                                header in headers[genome],
                                'Header %s from output c2h %s not found in input fa %s'
                                ' for genome %s' %
                                (header, c2hPath, seqMap[genome], genome))

        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
Example #8
0
def main():
    """Append each experiment subtree of a multi-cactus project to a HAL file.

    Options come from ``initParser()``. Unless ``append`` is set, the
    target HAL file is removed first. The project tree is walked
    breadth-first (from ``event`` when given, which must name a
    non-leaf node); for every node with an experiment in the project,
    the experiment path and the ``halAppendCactusSubtree`` command are
    echoed and the command is run through ``system``. Only the
    experiment's own subtree (``getTree(onlyThisSubtree=True)``) is
    passed to the tool. Total wall-clock time and time spent in the
    append commands are printed at the end.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal. Quote the path the same way it is
        # quoted for halAppendCactusSubtree below, so that a path with
        # whitespace removes that file and not several other ones.
        removeCmd = "rm -f '{0}'".format(args['HAL_file_path'])
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Options forwarded verbatim as "--<name> <value>" when they are set.
    valuedOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                     "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            # Single-argument print(...) behaves the same under the
            # Python 2 print statement and the Python 3 function.
            print(experimentFilePath)
            experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            cmdline = "time halAppendCactusSubtree '{0}' '{1}' '{2}' '{3}'".format(
                experiment.getHALPath(), experiment.getHALFastaPath(),
                expTreeString, args['HAL_file_path'])

            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            for option in valuedOptions:
                if args[option] is not None:
                    cmdline += " --{0} {1}".format(option, args[option])
            if args["inMemory"] is True:
                cmdline += " --inMemory"

            print(cmdline)
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime

    totalTime = time.time() - totalTime
    print("total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))