Example #1
0
 def _buildExecutor(self):
     """
     Build the ExecutorInfo message that describes our executor implementation.

     The command is whatever ``resolveEntryPoint('_toil_mesos_executor')`` yields, the
     executor ID embeds the ID of the current process and the source is the login name of
     the user owning the current process.

     :return: the populated ``mesos_pb2.ExecutorInfo`` instance
     """
     executor = mesos_pb2.ExecutorInfo()
     # The executor program is installed as a setuptools entry point by setup.py
     executor.command.value = resolveEntryPoint('_toil_mesos_executor')
     executor.name = "toil"
     # Identify this executor by the PID of the process that builds it
     executor.executor_id.value = "toil-%i" % os.getpid()
     currentUser = pwd.getpwuid(os.getuid())
     executor.source = currentUser.pw_name
     return executor
Example #2
0
 def issueJob(self, jobStoreID, memory, cores, disk, preemptable):
     """
     Hand a job to the batch system and remember it as issued.

     Bumps ``self.jobsIssued``, builds the worker command from the ``_toil_worker`` entry
     point, ``self.jobStoreString`` and the job store ID, submits it through
     ``self.batchSystem.issueBatchJob`` and files an ``IssuedJob`` record under the returned
     batch system ID in ``self.jobBatchSystemIDToIssuedJob``.

     :param jobStoreID: job store ID of the job being issued
     :param memory: memory requirement forwarded to the batch system
     :param cores: cores requirement forwarded to the batch system
     :param disk: disk requirement forwarded to the batch system
     :param preemptable: preemptable flag forwarded to the batch system
     """
     self.jobsIssued += 1
     workerArgs = (resolveEntryPoint('_toil_worker'), self.jobStoreString, jobStoreID)
     batchID = self.batchSystem.issueBatchJob(' '.join(workerArgs), memory, cores, disk,
                                              preemptable)
     issued = IssuedJob(jobStoreID, memory, cores, disk, preemptable)
     self.jobBatchSystemIDToIssuedJob[batchID] = issued
     logger.debug("Issued job with job store ID: %s and job batch system ID: "
                  "%s and cores: %.2f, disk: %.2f, and memory: %.2f",
                  jobStoreID, str(batchID), cores, disk, memory)
Example #3
0
 def _buildExecutor(self):
     """
     Assemble and return the ExecutorInfo message for our executor implementation.

     :return: a ``mesos_pb2.ExecutorInfo`` whose name is "toil", whose command is the result
              of ``resolveEntryPoint('_toil_mesos_executor')``, whose executor ID contains
              the current PID and whose source is the current user's login name
     """
     executorInfo = mesos_pb2.ExecutorInfo()
     pid = os.getpid()
     userName = pwd.getpwuid(os.getuid()).pw_name
     executorInfo.name = "toil"
     # The executor program is installed as a setuptools entry point by setup.py
     executorInfo.command.value = resolveEntryPoint('_toil_mesos_executor')
     executorInfo.executor_id.value = "toil-%i" % pid
     executorInfo.source = userName
     return executorInfo
Example #4
0
 def issueJob(self, jobStoreID, memory, cores, disk, preemptable):
     """
     Submit a job to the batch system and record it among the issued jobs.

     The worker command is the ``_toil_worker`` entry point followed by
     ``self.jobStoreLocator`` and the job store ID. The batch system ID returned by
     ``self.batchSystem.issueBatchJob`` becomes the key of a new ``IssuedJob`` entry in
     ``self.jobBatchSystemIDToIssuedJob``; ``self.jobsIssued`` is incremented first.

     :param jobStoreID: job store ID of the job being issued
     :param memory: memory requirement forwarded to the batch system
     :param cores: cores requirement forwarded to the batch system
     :param disk: disk requirement forwarded to the batch system
     :param preemptable: preemptable flag forwarded to the batch system
     """
     self.jobsIssued += 1
     workerBinary = resolveEntryPoint('_toil_worker')
     command = ' '.join([workerBinary, self.jobStoreLocator, jobStoreID])
     batchSystemID = self.batchSystem.issueBatchJob(command, memory, cores, disk, preemptable)
     record = IssuedJob(jobStoreID, memory, cores, disk, preemptable)
     self.jobBatchSystemIDToIssuedJob[batchSystemID] = record
     logger.debug("Issued job with job store ID: %s and job batch system ID: "
                  "%s and cores: %.2f, disk: %.2f, and memory: %.2f",
                  jobStoreID, str(batchSystemID), cores, disk, memory)
Example #5
0
File: leader.py Project: arkal/toil
 def issueJob(self, jobStoreID, memory, cores, disk):
     """
     Submit a job to the batch system and map its batch system ID to its job store ID.

     Increments ``self.jobsIssued``, builds the worker command from the ``_toil_worker``
     entry point, ``self.jobStoreString`` and the job store ID, and stores the job store ID
     in ``self.jobBatchSystemIDToJobStoreIDHash`` under the ID returned by
     ``self.batchSystem.issueBatchJob``.

     :param jobStoreID: job store ID of the job being issued
     :param memory: memory requirement forwarded to the batch system
     :param cores: cores requirement forwarded to the batch system
     :param disk: disk requirement forwarded to the batch system
     """
     self.jobsIssued += 1
     commandParts = (resolveEntryPoint('_toil_worker'), self.jobStoreString, jobStoreID)
     batchID = self.batchSystem.issueBatchJob(' '.join(commandParts), memory, cores, disk)
     self.jobBatchSystemIDToJobStoreIDHash[batchID] = jobStoreID
     logger.debug("Issued job with job store ID: %s and job batch system ID: "
                  "%s and cores: %i, disk: %i, and memory: %i",
                  jobStoreID, str(batchID), cores, disk, memory)
Example #6
0
 def issueJob(self, issuedJob):
     """
     Submit the given job to the batch system and record it as issued.

     The worker command is the ``_toil_worker`` entry point followed by
     ``self.config.jobStore`` and the job's job store ID. The job object itself is stored in
     ``self.jobBatchSystemIDToIssuedJob`` under the ID returned by the batch system.

     :param issuedJob: object providing ``jobStoreID``, ``memory``, ``cores``, ``disk`` and
            ``preemptable`` attributes
     """
     workerBinary = resolveEntryPoint('_toil_worker')
     command = ' '.join((workerBinary, self.config.jobStore, issuedJob.jobStoreID))
     batchID = self.batchSystem.issueBatchJob(command,
                                              issuedJob.memory,
                                              issuedJob.cores,
                                              issuedJob.disk,
                                              issuedJob.preemptable)
     self.jobBatchSystemIDToIssuedJob[batchID] = issuedJob
     logger.debug("Issued job with job store ID: %s and job batch system ID: "
                  "%s and cores: %i, disk: %i, and memory: %i",
                  issuedJob.jobStoreID, str(batchID), issuedJob.cores,
                  issuedJob.disk, issuedJob.memory)
Example #7
0
 def issueJob(self, issuedJob):
     """
     Issue a job to the batch system and keep track of it by its batch system ID.

     Builds the worker command (``_toil_worker`` entry point, ``self.config.jobStore``, the
     job's job store ID), passes it with the job's resource requirements to
     ``self.batchSystem.issueBatchJob`` and registers the job in
     ``self.jobBatchSystemIDToIssuedJob``.

     :param issuedJob: object providing ``jobStoreID``, ``memory``, ``cores``, ``disk`` and
            ``preemptable`` attributes
     """
     commandParts = [resolveEntryPoint('_toil_worker'),
                     self.config.jobStore,
                     issuedJob.jobStoreID]
     jobCommand = ' '.join(commandParts)
     batchSystemID = self.batchSystem.issueBatchJob(
         jobCommand, issuedJob.memory, issuedJob.cores, issuedJob.disk, issuedJob.preemptable)
     self.jobBatchSystemIDToIssuedJob[batchSystemID] = issuedJob
     logger.debug("Issued job with job store ID: %s and job batch system ID: "
                  "%s and cores: %i, disk: %i, and memory: %i",
                  issuedJob.jobStoreID, str(batchSystemID), issuedJob.cores,
                  issuedJob.disk, issuedJob.memory)
Example #8
0
 def issueJob(self, jobNode):
     """
     Add a job to the queue of jobs.

     Sets ``jobNode.command`` to the worker command line (``_toil_worker`` entry point,
     ``self.jobStoreLocator``, the job's job store ID), submits the job through
     ``self.batchSystem.issueBatchJob`` and registers it in
     ``self.jobBatchSystemIDToIssuedJob`` under the returned batch system ID. For
     preemptable jobs ``self.preemptableJobsIssued`` is incremented.

     :param jobNode: object providing ``jobStoreID``, ``preemptable``, ``cores``, ``disk``
            and ``memory`` attributes; its ``command`` attribute is overwritten
     """
     jobNode.command = ' '.join((resolveEntryPoint('_toil_worker'),
                                 self.jobStoreLocator, jobNode.jobStoreID))
     jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode)
     self.jobBatchSystemIDToIssuedJob[jobBatchSystemID] = jobNode
     if jobNode.preemptable:
         # Count the job only once it is registered: the counter then never exceeds the
         # number of entries in jobBatchSystemIDToIssuedJob, and a failure while resolving
         # the entry point or submitting the job does not leave it over-counted.
         self.preemptableJobsIssued += 1
     logger.debug("Issued job with job store ID: %s and job batch system ID: "
                  "%s and cores: %.2f, disk: %.2f, and memory: %.2f",
                  jobNode.jobStoreID, str(jobBatchSystemID), jobNode.cores,
                  jobNode.disk, jobNode.memory)
Example #9
0
 def issueJob(self, jobNode):
     """
     Submit a job to the batch system and register it as issued.

     ``jobNode.command`` is overwritten with the worker command line (``_toil_worker`` entry
     point, ``self.jobStoreLocator``, the job's job store ID) before the job is passed to
     ``self.batchSystem.issueBatchJob``. The issue is logged at debug level when the job name
     starts with ``self.debugJobNames`` and at info level otherwise.

     :param jobNode: object providing ``jobStoreID``, ``jobName``, ``preemptable``,
            ``cores``, ``disk`` and ``memory`` attributes
     """
     workerArgs = (resolveEntryPoint('_toil_worker'), self.jobStoreLocator, jobNode.jobStoreID)
     jobNode.command = ' '.join(workerArgs)
     batchID = self.batchSystem.issueBatchJob(jobNode)
     self.jobBatchSystemIDToIssuedJob[batchID] = jobNode
     # len(jobBatchSystemIDToIssuedJob) should always be greater than or equal to
     # preemptableJobsIssued, so the counter is only bumped once the job is in the dict
     if jobNode.preemptable:
         self.preemptableJobsIssued += 1
     if jobNode.jobName.startswith(self.debugJobNames):
         emit = logger.debug
     else:
         emit = logger.info
     emit("Issued job %s with job batch system ID: "
          "%s and cores: %s, disk: %s, and memory: %s",
          jobNode, str(batchID), int(jobNode.cores),
          bytes2human(jobNode.disk), bytes2human(jobNode.memory))
Example #10
0
 def issueJob(self, jobNode):
     """
     Submit a job to the batch system and register it as issued.

     ``jobNode.command`` is overwritten with the worker command line (``_toil_worker`` entry
     point, ``self.jobStoreLocator``, the job's job store ID) before the job is passed to
     ``self.batchSystem.issueBatchJob``. The issue is logged at debug level when the job name
     starts with ``CWL_INTERNAL_JOBS`` and at info level otherwise.

     :param jobNode: object providing ``jobStoreID``, ``jobName``, ``preemptable``,
            ``cores``, ``disk`` and ``memory`` attributes
     """
     workerBinary = resolveEntryPoint('_toil_worker')
     jobNode.command = ' '.join([workerBinary, self.jobStoreLocator, jobNode.jobStoreID])
     batchSystemID = self.batchSystem.issueBatchJob(jobNode)
     self.jobBatchSystemIDToIssuedJob[batchSystemID] = jobNode
     # len(jobBatchSystemIDToIssuedJob) should always be greater than or equal to
     # preemptableJobsIssued, so the counter is only bumped once the job is in the dict
     if jobNode.preemptable:
         self.preemptableJobsIssued += 1
     if jobNode.jobName.startswith(CWL_INTERNAL_JOBS):
         log = logger.debug
     else:
         log = logger.info
     log("Issued job %s with job batch system ID: "
         "%s and cores: %s, disk: %s, and memory: %s",
         jobNode, str(batchSystemID), int(jobNode.cores),
         bytes2human(jobNode.disk), bytes2human(jobNode.memory))
Example #11
0
 def toilMain(self):
     """
     Return the result of resolving the ``toil`` entry point via resolveEntryPoint.
     """
     toilEntryPoint = resolveEntryPoint("toil")
     return toilEntryPoint
Example #12
0
    def _toilSort(self, jobStore, batchSystem,
                  lines=defaultLines, N=defaultN, testNo=1, lineLen=defaultLineLen):
        """
        Generate a file consisting of the given number of random lines, each line of the given
        length. Sort the file with Toil by splitting the file recursively until each part is less
        than the given number of bytes, sorting each part and merging them back together. Then
        verify the result.

        The restart handling is exercised along the way: restarting a workflow that does not
        exist and re-running without restart on an existing job store must both raise
        JobStoreCreationException, runs that end with failed jobs are restarted until none
        remain, and restarting the finished workflow must raise JobException. The job store is
        removed with ``toil clean`` after every repeat, whether or not the checks passed.

        :param jobStore: a job store string

        :param batchSystem: the name of the batch system

        :param lines: the number of random lines to generate

        :param N: the size in bytes of each split

        :param testNo: the number of repeats of this test

        :param lineLen: the length of each random line in the file
        """
        for _ in xrange(testNo):
            try:
                # Specify options
                options = Job.Runner.getDefaultOptions(jobStore)
                options.logLevel = getLogLevelString()
                options.retryCount = 2
                options.batchSystem = batchSystem
                options.clean = "never"
                options.badWorker = 0.5
                options.badWorkerFailInterval = 0.05

                # Make the file to sort
                tempSortFile = os.path.join(self.tempDir, "fileToSort.txt")
                makeFileToSort(tempSortFile, lines=lines, lineLen=lineLen)

                # First make our own sorted version
                with open(tempSortFile, 'r') as fileHandle:
                    expectedLines = sorted(fileHandle.readlines())

                # Make the first job
                firstJob = Job.wrapJobFn(setup, tempSortFile, N, memory=sortMemory)

                # Check we get an exception if we try to restart a workflow that doesn't exist
                options.restart = True
                with self.assertRaises(JobStoreCreationException):
                    Job.Runner.startToil(firstJob, options)

                options.restart = False

                # Now actually run the workflow
                try:
                    Job.Runner.startToil(firstJob, options)
                    failedJobs = 0
                except FailedJobsException as e:
                    failedJobs = e.numberOfFailedJobs

                # Check we get an exception if we try to run without restart on an existing store
                with self.assertRaises(JobStoreCreationException):
                    Job.Runner.startToil(firstJob, options)

                options.restart = True

                # This loop tests the restart behavior
                totalTrys = 1
                while failedJobs != 0:
                    options.useExistingOptions = random.random() > 0.5
                    try:
                        Job.Runner.startToil(firstJob, options)
                        failedJobs = 0
                    except FailedJobsException as e:
                        failedJobs = e.numberOfFailedJobs
                        # Give up once more than 16 restart attempts have failed
                        if totalTrys > 16:
                            self.fail('Exceeded a reasonable number of restarts')
                        totalTrys += 1

                # Now check that if you try to restart from here it will raise an exception
                # indicating that there are no jobs remaining in the workflow.
                with self.assertRaises(JobException):
                    Job.Runner.startToil(firstJob, options)

                # Now check the file is properly sorted..
                with open(tempSortFile, 'r') as fileHandle:
                    self.assertEqual(expectedLines, fileHandle.readlines())
            finally:
                subprocess.check_call([resolveEntryPoint('toil'), 'clean', jobStore])
Example #13
0
    def _toilSort(self, jobStoreLocator, batchSystem,
                  lines=defaultLines, N=defaultN, testNo=1, lineLen=defaultLineLen,
                  retryCount=2, badWorker=0.5, downCheckpoints=False, disableCaching=False):
        """
        Generate a file consisting of the given number of random lines, each line of the given
        length. Sort the file with Toil by splitting the file recursively until each part is less
        than the given number of bytes, sorting each part and merging them back together. Then
        verify the result.

        The restart handling is exercised along the way: restarting a workflow that does not
        exist must raise NoSuchJobStoreException, re-running without restart on an existing job
        store must raise JobStoreExistsException, runs that end with failed jobs are restarted
        until none remain, and restarting the finished workflow must raise JobException. The
        job store is removed with ``toil clean`` after every repeat.

        :param jobStoreLocator: The location of the job store.

        :param batchSystem: the name of the batch system

        :param lines: the number of random lines to generate

        :param N: the size in bytes of each split

        :param testNo: the number of repeats of this test

        :param lineLen: the length of each random line in the file

        :param retryCount: value assigned to ``options.retryCount``

        :param badWorker: value assigned to ``options.badWorker``

        :param downCheckpoints: value assigned to ``options.downCheckpoints``

        :param disableCaching: value assigned to ``options.disableCaching``
        """
        for _ in xrange(testNo):
            try:
                # Specify options
                options = Job.Runner.getDefaultOptions(jobStoreLocator)
                options.logLevel = getLogLevelString()
                options.retryCount = retryCount
                options.batchSystem = batchSystem
                options.clean = "never"
                options.badWorker = badWorker
                options.badWorkerFailInterval = 0.05
                options.disableCaching = disableCaching  # FIXME maybe this line should be deleted
                options.downCheckpoints = downCheckpoints
                options.N = N

                # Make the file to sort
                tempSortFile = os.path.join(self.tempDir, "fileToSort.txt")
                makeFileToSort(tempSortFile, lines=lines, lineLen=lineLen)
                options.fileToSort = tempSortFile

                # First make our own sorted version
                with open(tempSortFile, 'r') as fileHandle:
                    expectedLines = sorted(fileHandle.readlines())

                # Check we get an exception if we try to restart a workflow that doesn't exist
                options.restart = True
                with self.assertRaises(NoSuchJobStoreException):
                    main(options)

                options.restart = False

                # Now actually run the workflow
                try:
                    main(options)
                    failedJobs = 0
                except FailedJobsException as e:
                    failedJobs = e.numberOfFailedJobs

                # Check we get an exception if we try to run without restart on an existing store
                with self.assertRaises(JobStoreExistsException):
                    main(options)

                options.restart = True

                # This loop tests the restart behavior
                totalTrys = 1
                while failedJobs != 0:
                    options.useExistingOptions = random.random() > 0.5
                    try:
                        main(options)
                        failedJobs = 0
                    except FailedJobsException as e:
                        failedJobs = e.numberOfFailedJobs
                        if totalTrys > 32:  # p(fail after this many restarts) = 0.5**32
                            self.fail('Exceeded a reasonable number of restarts')
                        totalTrys += 1

                # Now check that if you try to restart from here it will raise an exception
                # indicating that there are no jobs remaining in the workflow.
                with self.assertRaises(JobException):
                    main(options)

                # Now check the file is properly sorted..
                with open(tempSortFile, 'r') as fileHandle:
                    self.assertEqual(expectedLines, fileHandle.readlines())
            finally:
                subprocess.check_call([resolveEntryPoint('toil'), 'clean', jobStoreLocator])
Example #14
0
    def _toilSort(self,
                  jobStoreLocator,
                  batchSystem,
                  lines=defaultLines,
                  N=defaultN,
                  testNo=1,
                  lineLen=defaultLineLen,
                  retryCount=2,
                  badWorker=0.5,
                  downCheckpoints=False,
                  disableCaching=False):
        """
        Generate a file consisting of the given number of random lines, each line of the given
        length. Sort the file with Toil by splitting the file recursively until each part is less
        than the given number of bytes, sorting each part and merging them back together. Then
        verify the result.

        The restart handling is exercised along the way: restarting a workflow that does not
        exist must raise NoSuchJobStoreException, re-running without restart on an existing job
        store must raise JobStoreExistsException, runs that end with failed jobs are restarted
        until none remain, and restarting the finished workflow must raise JobException. The
        job store is removed with ``toil clean`` after every repeat.

        :param jobStoreLocator: The location of the job store.

        :param batchSystem: the name of the batch system

        :param lines: the number of random lines to generate

        :param N: the size in bytes of each split

        :param testNo: the number of repeats of this test

        :param lineLen: the length of each random line in the file

        :param retryCount: value assigned to ``options.retryCount``

        :param badWorker: value assigned to ``options.badWorker``

        :param downCheckpoints: value assigned to ``options.downCheckpoints``

        :param disableCaching: value assigned to ``options.disableCaching``
        """
        for _ in xrange(testNo):
            try:
                # Specify options
                options = Job.Runner.getDefaultOptions(jobStoreLocator)
                options.logLevel = getLogLevelString()
                options.retryCount = retryCount
                options.batchSystem = batchSystem
                options.clean = "never"
                options.badWorker = badWorker
                options.badWorkerFailInterval = 0.05
                options.disableCaching = disableCaching  # FIXME maybe this line should be deleted
                options.downCheckpoints = downCheckpoints
                options.N = N

                # Make the file to sort
                tempSortFile = os.path.join(self.tempDir, "fileToSort.txt")
                makeFileToSort(tempSortFile, lines=lines, lineLen=lineLen)
                options.fileToSort = tempSortFile

                # First make our own sorted version
                with open(tempSortFile, 'r') as fileHandle:
                    expectedLines = sorted(fileHandle.readlines())

                # Check we get an exception if we try to restart a workflow that doesn't exist
                options.restart = True
                with self.assertRaises(NoSuchJobStoreException):
                    main(options)

                options.restart = False

                # Now actually run the workflow
                try:
                    main(options)
                    failedJobs = 0
                except FailedJobsException as e:
                    failedJobs = e.numberOfFailedJobs

                # Check we get an exception if we try to run without restart on an existing store
                with self.assertRaises(JobStoreExistsException):
                    main(options)

                options.restart = True

                # This loop tests the restart behavior
                totalTrys = 1
                while failedJobs != 0:
                    options.useExistingOptions = random.random() > 0.5
                    try:
                        main(options)
                        failedJobs = 0
                    except FailedJobsException as e:
                        failedJobs = e.numberOfFailedJobs
                        if totalTrys > 32:  # p(fail after this many restarts) = 0.5**32
                            self.fail(
                                'Exceeded a reasonable number of restarts')
                        totalTrys += 1

                # Now check that if you try to restart from here it will raise an exception
                # indicating that there are no jobs remaining in the workflow.
                with self.assertRaises(JobException):
                    main(options)

                # Now check the file is properly sorted..
                with open(tempSortFile, 'r') as fileHandle:
                    self.assertEqual(expectedLines, fileHandle.readlines())
            finally:
                subprocess.check_call(
                    [resolveEntryPoint('toil'), 'clean', jobStoreLocator])
Example #15
0
    def _toilSort(self,
                  jobStore,
                  batchSystem,
                  lines=defaultLines,
                  N=defaultN,
                  testNo=1,
                  lineLen=defaultLineLen):
        """
        Generate a file consisting of the given number of random lines, each line of the given
        length. Sort the file with Toil by splitting the file recursively until each part is less
        than the given number of bytes, sorting each part and merging them back together. Then
        verify the result.

        The restart handling is exercised along the way: restarting a workflow that does not
        exist and re-running without restart on an existing job store must both raise
        JobStoreCreationException, runs that end with failed jobs are restarted until none
        remain, and restarting the finished workflow must raise JobException. The job store is
        removed with ``toil clean`` after every repeat, whether or not the checks passed.

        :param jobStore: a job store string

        :param batchSystem: the name of the batch system

        :param lines: the number of random lines to generate

        :param N: the size in bytes of each split

        :param testNo: the number of repeats of this test

        :param lineLen: the length of each random line in the file
        """
        for _ in xrange(testNo):
            try:
                # Specify options
                options = Job.Runner.getDefaultOptions(jobStore)
                options.logLevel = getLogLevelString()
                options.retryCount = 2
                options.batchSystem = batchSystem
                options.clean = "never"
                options.badWorker = 0.5
                options.badWorkerFailInterval = 0.05

                # Make the file to sort
                tempSortFile = os.path.join(self.tempDir, "fileToSort.txt")
                makeFileToSort(tempSortFile, lines=lines, lineLen=lineLen)

                # First make our own sorted version
                with open(tempSortFile, 'r') as fileHandle:
                    expectedLines = sorted(fileHandle.readlines())

                # Make the first job
                firstJob = Job.wrapJobFn(setup,
                                         tempSortFile,
                                         N,
                                         memory=sortMemory)

                # Check we get an exception if we try to restart a workflow that doesn't exist
                options.restart = True
                with self.assertRaises(JobStoreCreationException):
                    Job.Runner.startToil(firstJob, options)

                options.restart = False

                # Now actually run the workflow
                try:
                    Job.Runner.startToil(firstJob, options)
                    failedJobs = 0
                except FailedJobsException as e:
                    failedJobs = e.numberOfFailedJobs

                # Check we get an exception if we try to run without restart on an existing store
                with self.assertRaises(JobStoreCreationException):
                    Job.Runner.startToil(firstJob, options)

                options.restart = True

                # This loop tests the restart behavior
                totalTrys = 1
                while failedJobs != 0:
                    options.useExistingOptions = random.random() > 0.5
                    try:
                        Job.Runner.startToil(firstJob, options)
                        failedJobs = 0
                    except FailedJobsException as e:
                        failedJobs = e.numberOfFailedJobs
                        # Give up once more than 16 restart attempts have failed
                        if totalTrys > 16:
                            self.fail(
                                'Exceeded a reasonable number of restarts')
                        totalTrys += 1

                # Now check that if you try to restart from here it will raise an exception
                # indicating that there are no jobs remaining in the workflow.
                with self.assertRaises(JobException):
                    Job.Runner.startToil(firstJob, options)

                # Now check the file is properly sorted..
                with open(tempSortFile, 'r') as fileHandle:
                    self.assertEqual(expectedLines, fileHandle.readlines())
            finally:
                subprocess.check_call(
                    [resolveEntryPoint('toil'), 'clean', jobStore])
Example #16
0
 def toilMain(self):
     """
     Resolve the ``toil`` entry point with resolveEntryPoint and return the result.
     """
     resolved = resolveEntryPoint('toil')
     return resolved
Example #17
0
    def _toilSort(self,
                  jobStoreLocator,
                  batchSystem,
                  lines=defaultLines,
                  N=defaultN,
                  testNo=1,
                  lineLen=defaultLineLen,
                  retryCount=2,
                  badWorker=0.5,
                  downCheckpoints=False,
                  disableCaching=False):
        """
        Generate a file consisting of the given number of random lines, each line of the given
        length. Sort the file with Toil by splitting the file recursively until each part is less
        than the given number of bytes, sorting each part and merging them back together. Then
        verify the result.

        The input is written to ``self.inputFile`` and the workflow is pointed at
        ``self.outputFile`` for its output. Restarting a workflow that does not exist must
        raise NoSuchJobStoreException, re-running without restart on an existing job store
        must raise JobStoreExistsException, and runs that end with failed jobs are restarted
        until none remain. After every repeat the job store is removed with ``toil clean`` and
        its removal is verified.

        :param jobStoreLocator: The location of the job store.

        :param batchSystem: the name of the batch system

        :param lines: the number of random lines to generate

        :param N: the size in bytes of each split

        :param testNo: the number of repeats of this test

        :param lineLen: the length of each random line in the file

        :param retryCount: value assigned to ``options.retryCount``

        :param badWorker: value assigned to ``options.badWorker``

        :param downCheckpoints: value assigned to ``options.downCheckpoints``

        :param disableCaching: value assigned to ``options.disableCaching``
        """
        for _ in range(testNo):
            try:
                # Specify options
                options = Job.Runner.getDefaultOptions(jobStoreLocator)
                options.logLevel = getLogLevelString()
                options.retryCount = retryCount
                options.batchSystem = batchSystem
                options.clean = "never"
                options.badWorker = badWorker
                options.badWorkerFailInterval = 0.05
                options.disableCaching = disableCaching
                # This is required because mesosMasterAddress now defaults to the IP of the machine
                # that is starting the workflow while the mesos *tests* run locally.
                if batchSystem == 'mesos':
                    options.mesosMasterAddress = 'localhost:5050'
                options.downCheckpoints = downCheckpoints
                options.N = N
                options.outputFile = self.outputFile
                options.fileToSort = self.inputFile
                options.overwriteOutput = True
                options.realTimeLogging = True

                # Make the file to sort
                makeFileToSort(options.fileToSort,
                               lines=lines,
                               lineLen=lineLen)

                # First make our own sorted version
                with open(options.fileToSort, 'r') as fileHandle:
                    l = fileHandle.readlines()
                    l.sort()

                # Check we get an exception if we try to restart a workflow that doesn't exist
                options.restart = True
                with self.assertRaises(NoSuchJobStoreException):
                    with runMain(options):
                        # NOTE(review): this body only runs if runMain() does not raise, in
                        # which case the enclosing assertRaises fails regardless. The sorted
                        # output check looks misplaced - presumably it belongs after a
                        # successful run; confirm against runMain before moving it.
                        # Now check the file is properly sorted..
                        with open(options.outputFile, 'r') as fileHandle:
                            l2 = fileHandle.readlines()
                            self.assertEqual(l, l2)

                options.restart = False

                # Now actually run the workflow
                try:
                    with runMain(options):
                        pass
                    i = 0
                except FailedJobsException as e:
                    i = e.numberOfFailedJobs

                # Check we get an exception if we try to run without restart on an existing store
                with self.assertRaises(JobStoreExistsException):
                    with runMain(options):
                        pass

                options.restart = True

                # This loop tests the restart behavior
                totalTrys = 1
                while i != 0:
                    options.useExistingOptions = random.random() > 0.5
                    try:
                        with runMain(options):
                            pass
                        i = 0
                    except FailedJobsException as e:
                        i = e.numberOfFailedJobs
                        if totalTrys > 32:  # p(fail after this many restarts) = 0.5**32
                            self.fail(
                                'Exceeded a reasonable number of restarts')
                        totalTrys += 1
            finally:
                subprocess.check_call(
                    [resolveEntryPoint('toil'), 'clean', jobStoreLocator])
                # final test to make sure the jobStore was actually deleted
                self.assertRaises(NoSuchJobStoreException, Toil.resumeJobStore,
                                  jobStoreLocator)