Example #1
    def test_lost_job_execution(self):
        """Tests running through a job execution that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets lost
        when_lost = pre_task_completed + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        lost_task = running_job_exe.execution_lost(when_lost)
        self.assertEqual(job_task.id, lost_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
        self.assertEqual(when_lost, job_exe.ended)
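
The tests in these examples reference self._job_exe_id, which is created in the test case's setUp(). A minimal setUp sketch follows, assuming the job test utilities seen later in this listing; job_test_utils.create_job_exe and its status keyword are the only parts confirmed by the examples, while the class name, base class, and import path are illustrative:

from django.test import TransactionTestCase

from job.test import utils as job_test_utils


class TestRunningJobExecution(TransactionTestCase):
    """Illustrative container for the tests shown in these examples"""

    def setUp(self):
        # Create a job execution in RUNNING status and remember its id for the tests;
        # the real setUp would also configure node, resources, and task details.
        job_exe = job_test_utils.create_job_exe(status='RUNNING')
        self._job_exe_id = job_exe.id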
Example #2
    def test_pre_task_launch_error(self):
        """Tests running through a job execution where a pre-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task fails to launch
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = now()
        running_job_exe.task_fail(pre_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(
            id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Example #3
    def test_lost_job_execution(self):
        """Tests running through a job execution that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
        running_job_exe.task_update(update)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task and then execution gets lost
        when_lost = pre_task_completed + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        lost_task = running_job_exe.execution_lost(when_lost)
        self.assertEqual(job_task.id, lost_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
        self.assertEqual(when_lost, job_exe.ended)
Example #4
    def test_consider_next_task(self):
        """Tests consider_next_task() and get_accepted_running_job_exes()"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        result = node_offers.consider_next_task(job_exe_1)  # Same job_exe, should have no effect
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_high_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
        result = node_offers.consider_next_task(job_exe_high_cpus)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)

        job_exe_high_mem = RunningJobExecution(self.running_job_exe_high_mem)
        result = node_offers.consider_next_task(job_exe_high_mem)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)

        job_exe_high_disk = RunningJobExecution(self.running_job_exe_high_disk)
        result = node_offers.consider_next_task(job_exe_high_disk)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = node_offers.consider_next_task(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()),
                            {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self.assertEqual(node_offers._available_cpus, 68.0)
        self.assertEqual(node_offers._available_mem, 1536.0)
        self.assertEqual(node_offers._available_disk, 2222.0)
Example #5
    def test_lost_node(self):
        """Tests accepting a running and queued job execution and then the node being lost"""

        offer_1 = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        manager.lost_node(self.node_agent)
        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)
Example #6
    def test_job_type_limit(self, mock_taskinfo):
        """Tests running the scheduling thread with a job type limit"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        job_exe_1 = job_test_utils.create_job_exe(job_type=job_type_with_limit, status='RUNNING')
        queue_1_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_2_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_3_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_4_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_5_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_6_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
        self._job_type_manager.sync_with_database()
        # One job of this type is already running
        self._job_exe_manager.add_job_exes([RunningJobExecution(job_exe_1)])

        offer_1 = ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
        self._offer_manager.add_new_offers([offer_1, offer_2])

        num_tasks = self._scheduling_thread._perform_scheduling()
        self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
Example #7
    def test_job_exe_canceled(self):
        """Tests adding a job execution that becomes canceled while scheduling"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1',  self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2',  self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        job_exe_1.execution_canceled()
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.TASK_INVALID)

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
Example #8
    def test_pre_task_launch_error(self):
        """Tests running through a job execution where a pre-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task fails to launch
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED, now())
        running_job_exe.task_update(update)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Example #9
    def test_job_task_launch_error(self):
        """Tests running through a job execution where a Docker-based job-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_running(pre_task_id, pre_task_started, '', '')

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task fails to launch
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = now()
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(
            id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Example #10
    def test_docker_terminated_error(self):
        """Tests running through a job execution where a Docker container terminates"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)

        # Pre-task Docker container terminates
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED, now(),
                                                          reason='REASON_EXECUTOR_TERMINATED')
        running_job_exe.task_update(update)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-terminated')
Example #11
    def test_paused_node(self):
        """Tests adding job executions when the node is paused"""

        node_offers = NodeOffers(self.paused_node)
        offer_1 = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent_paused,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Ensure it accepts new tasks for already running job executions
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = node_offers.consider_next_task(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Don't accept new job executions while paused
        job_exe_new = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_new)
        self.assertEqual(result, NodeOffers.NODE_PAUSED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()),
                            {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self.assertEqual(node_offers._available_cpus, 68.0)
        self.assertEqual(node_offers._available_mem, 1536.0)
        self.assertEqual(node_offers._available_disk, 2222.0)
Example #12
    def test_pre_task_launch_error(self):
        """Tests running through a job execution where a pre-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task fails to launch
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.when = now()
        running_job_exe.task_fail(pre_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'task-launch')
Example #13
    def test_canceled_job_execution(self):
        """Tests running through a job execution that gets canceled"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
        running_job_exe.task_update(update)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task and then execution gets canceled
        job_task = running_job_exe.start_next_task()
        canceled_task = running_job_exe.execution_canceled()
        self.assertEqual(job_task.id, canceled_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())
Example #14
    def test_canceled_job_execution(self):
        """Tests running through a job execution that gets canceled"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets canceled
        job_task = running_job_exe.start_next_task()
        canceled_task = running_job_exe.execution_canceled()
        self.assertEqual(job_task.id, canceled_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())
Example #15
    def test_job_exe_canceled(self):
        """Tests adding a job execution that becomes canceled while scheduling"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        job_exe_1.execution_canceled()
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.TASK_INVALID)

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
Example #16
    def test_job_task_launch_error(self):
        """Tests running through a job execution where a Docker-based job-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task fails to launch
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = now()
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Example #17
    def test_no_offers(self):
        """Tests adding job executions when there are no offers"""

        node_offers = NodeOffers(self.node)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.NO_OFFERS)

        job_exe_new = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_new)
        self.assertEqual(result, NodeOffers.NO_OFFERS)

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
Example #18
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    task_update.job_exe_id = RunningJobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
Example #19
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    task_update.job_exe_id = RunningJobExecution.get_job_exe_id(
        task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
Example #20
    def test_no_ready_offers(self):
        """Tests considering job executions when no offers are ready"""

        offer_1 = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.NODE_OFFLINE)
Example #21
    def test_lost_node(self):
        """Tests when the node is lost"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Accept a couple job executions
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_2 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertGreater(node_offers._available_cpus, 0.0)
        self.assertGreater(node_offers._available_mem, 0.0)
        self.assertGreater(node_offers._available_disk, 0.0)

        # Node is lost
        node_offers.lost_node()
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertEqual(node_offers._available_cpus, 0.0)
        self.assertEqual(node_offers._available_mem, 0.0)
        self.assertEqual(node_offers._available_disk, 0.0)
Example #22
    def test_canceled_job_execution(self):
        """Tests running through a job execution that gets canceled"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets canceled
        job_task = running_job_exe.start_next_task()
        canceled_task = running_job_exe.execution_canceled()
        self.assertEqual(job_task.id, canceled_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())
Example #23
    def schedule_job_executions(self, framework_id, job_executions,
                                workspaces):
        """Schedules the given job executions on the provided nodes and resources. The corresponding queue models will
        be deleted from the database. All database changes occur in an atomic transaction.

        :param framework_id: The scheduling framework ID
        :type framework_id: string
        :param job_executions: A list of queued job executions that have been given nodes and resources on which to run
        :type job_executions: list[:class:`queue.job_exe.QueuedJobExecution`]
        :param workspaces: A dict of all workspaces stored by name
        :type workspaces: {string: :class:`storage.models.Workspace`}
        :returns: The scheduled job executions
        :rtype: list[:class:`job.execution.running.job_exe.RunningJobExecution`]
        """

        if not job_executions:
            return []

        job_exe_ids = []
        for job_execution in job_executions:
            job_exe_ids.append(job_execution.id)

        # Lock corresponding job executions
        job_exes = {}
        for job_exe in JobExecution.objects.select_for_update().filter(
                id__in=job_exe_ids).order_by('id'):
            job_exes[job_exe.id] = job_exe

        # Set up job executions to schedule
        executions_to_schedule = []
        for job_execution in job_executions:
            queue = job_execution.queue
            node = job_execution.provided_node
            resources = job_execution.provided_resources
            job_exe = job_exes[job_execution.id]

            # Ignore executions that are no longer queued (executions may have been changed since queue model was last
            # queried)
            if job_exe.status != 'QUEUED':
                continue

            # Check that resources are sufficient
            if resources.cpus < queue.cpus_required:
                msg = 'Job execution requires %s CPUs and only %s were provided'
                raise Exception(
                    msg % (str(queue.cpus_required), str(resources.cpus)))
            if resources.mem < queue.mem_required:
                msg = 'Job execution requires %s MiB of memory and only %s MiB were provided'
                raise Exception(msg %
                                (str(queue.mem_required), str(resources.mem)))
            if resources.disk_in < queue.disk_in_required:
                msg = 'Job execution requires %s MiB of input disk space and only %s MiB were provided'
                raise Exception(
                    msg %
                    (str(queue.disk_in_required), str(resources.disk_in)))
            if resources.disk_out < queue.disk_out_required:
                msg = 'Job execution requires %s MiB of output disk space and only %s MiB were provided'
                raise Exception(
                    msg %
                    (str(queue.disk_out_required), str(resources.disk_out)))
            if resources.disk_total < queue.disk_total_required:
                msg = 'Job execution requires %s MiB of total disk space and only %s MiB were provided'
                raise Exception(msg % (str(
                    queue.disk_total_required), str(resources.disk_total)))

            executions_to_schedule.append((job_exe, node, resources))

        # Schedule job executions
        scheduled_job_exes = []
        for job_exe in JobExecution.objects.schedule_job_executions(
                framework_id, executions_to_schedule, workspaces):
            scheduled_job_exes.append(RunningJobExecution(job_exe))

        # Clear the job executions from the queue
        Queue.objects.filter(job_exe_id__in=job_exe_ids).delete()

        return scheduled_job_exes
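
A hedged usage sketch for schedule_job_executions follows. The offer-manager calls mirror the earlier examples; the assumption that the method is exposed as Queue.objects.schedule_job_executions, along with the framework_id and workspaces_by_name names, is illustrative and not confirmed by this listing:

# Schedule the job executions accepted during one offer cycle (sketch only)
accepted_new_job_exes = []
for node_offers in offer_manager.pop_offers_with_accepted_job_exes():
    accepted_new_job_exes.extend(node_offers.get_accepted_new_job_exes())

running_job_exes = Queue.objects.schedule_job_executions(
    framework_id, accepted_new_job_exes, workspaces_by_name)
for running_job_exe in running_job_exes:
    task = running_job_exe.start_next_task()  # first task to hand to Mesos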
Example #24
    def statusUpdate(self, driver, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        started = now()

        task_id = status.task_id.value
        job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

        # Since we have a status update for this task, remove it from reconciliation set
        self._recon_thread.remove_task_id(task_id)

        try:
            running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

            if running_job_exe:
                results = TaskResults(task_id)
                results.exit_code = utils.parse_exit_code(status)
                results.when = utils.get_status_timestamp(status)
                if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                    mesos_pb2.TASK_KILLED]:
                    try:
                        log_start_time = now()
                        hostname = running_job_exe._node_hostname
                        port = running_job_exe._node_port
                        task_dir = get_slave_task_directory(hostname, port, task_id)
                        results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                        results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                        log_end_time = now()
                        logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                    except Exception:
                        logger.exception('Error pulling logs for task %s', task_id)

                # Apply status update to running job execution
                if status.state == mesos_pb2.TASK_RUNNING:
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                    stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                    running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
                elif status.state == mesos_pb2.TASK_FINISHED:
                    running_job_exe.task_complete(results)
                elif status.state == mesos_pb2.TASK_LOST:
                    running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
                elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                    running_job_exe.task_fail(results)

                # Remove finished job execution
                if running_job_exe.is_finished():
                    self._job_exe_manager.remove_job_exe(job_exe_id)
            else:
                # Scheduler doesn't have any knowledge of this job execution
                Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
        except Exception:
            logger.exception('Error handling status update for job execution: %s', job_exe_id)
            # Error handling status update, add task so it can be reconciled
            self._recon_thread.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #25
    def statusUpdate(self, driver, status):
        '''
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        '''

        started = now()

        task_id = status.task_id.value
        job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id,
                    utils.status_to_string(status.state))

        # Since we have a status update for this task, remove it from reconciliation set
        self._recon_thread.remove_task_id(task_id)

        try:
            running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

            if running_job_exe:
                results = TaskResults(task_id)
                results.exit_code = utils.parse_exit_code(status)
                results.when = utils.get_status_timestamp(status)
                if status.state in [
                        mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR,
                        mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED
                ]:
                    try:
                        log_start_time = now()
                        hostname = running_job_exe._node_hostname
                        port = running_job_exe._node_port
                        task_dir = get_slave_task_directory(
                            hostname, port, task_id)
                        results.stdout = get_slave_task_file(
                            hostname, port, task_dir, 'stdout')
                        results.stderr = get_slave_task_file(
                            hostname, port, task_dir, 'stderr')
                        log_end_time = now()
                        logger.debug('Time to pull logs for task: %s',
                                     str(log_end_time - log_start_time))
                    except Exception:
                        logger.exception('Error pulling logs for task %s',
                                         task_id)

                # Apply status update to running job execution
                if status.state == mesos_pb2.TASK_RUNNING:
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(
                        hostname, port, task_id)
                    stdout_url = get_slave_task_url(hostname, port, task_dir,
                                                    'stdout')
                    stderr_url = get_slave_task_url(hostname, port, task_dir,
                                                    'stderr')
                    running_job_exe.task_running(task_id, results.when,
                                                 stdout_url, stderr_url)
                elif status.state == mesos_pb2.TASK_FINISHED:
                    running_job_exe.task_complete(results)
                elif status.state == mesos_pb2.TASK_LOST:
                    running_job_exe.task_fail(
                        results, Error.objects.get_builtin_error('mesos-lost'))
                elif status.state in [
                        mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                        mesos_pb2.TASK_KILLED
                ]:
                    running_job_exe.task_fail(results)

                # Remove finished job execution
                if running_job_exe.is_finished():
                    self._job_exe_manager.remove_job_exe(job_exe_id)
            else:
                # Scheduler doesn't have any knowledge of this job execution
                Queue.objects.handle_job_failure(
                    job_exe_id, now(),
                    Error.objects.get_builtin_error('scheduler-lost'))
        except Exception:
            logger.exception(
                'Error handling status update for job execution: %s',
                job_exe_id)
            # Error handling status update, add task so it can be reconciled
            self._recon_thread.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #26
    def test_general_algorithm_error(self):
        """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit code)
        """

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        running_job_exe.task_start(job_task_id, job_task_started)

        # Fail job-task
        job_task_failed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = job_task_failed
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'algorithm-unknown')
Example #27
    def test_failed_normal_job_execution(self):
        """Tests running through a normal job execution that fails"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Fail pre-task
        pre_task_failed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED,
                                                          pre_task_failed, exit_code=1)
        running_job_exe.task_update(update)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_failed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual('FAILED', job_exe.status)
        self.assertIsNotNone(job_exe.error_id)
        self.assertGreater(job_exe.ended, pre_task_failed)
Example #28
    def test_lost_task(self):
        """Tests running through a job execution that has a task that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
        running_job_exe.task_update(update)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        job_task_started = pre_task_completed + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
        running_job_exe.task_update(update)
        self.assertTrue(task.has_started)

        # Lose task and make sure the same task is the next one to schedule again
        when_lost = job_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.LOST, when_lost)
        running_job_exe.task_update(update)
        self.assertFalse(task.has_started)
        task = running_job_exe.start_next_task()
        self.assertEqual(job_task_id, task.id)
Example #29
    def test_successful_normal_job_execution(self):
        """Tests running through a normal job execution successfully"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task sets updated command arguments
        updated_commands_args = '-arg updated'
        JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        self.assertEqual(task._command_arguments, updated_commands_args)  # Make sure job task has updated command args
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Job-task running
        job_task_started = pre_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(job_task_id, job_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 2
        job_task_results.when = job_task_completed
        running_job_exe.task_complete(job_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Post-task running
        post_task_started = job_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(post_task_id, post_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete post-task
        post_task_completed = post_task_started + timedelta(seconds=1)
        post_task_results = TaskResults(post_task_id)
        post_task_results.exit_code = 3
        post_task_results.when = post_task_completed
        running_job_exe.task_complete(post_task_results)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_completed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual(job_task_started, job_exe.job_started)
        self.assertEqual(job_task_completed, job_exe.job_completed)
        self.assertEqual(2, job_exe.job_exit_code)
        self.assertEqual(post_task_started, job_exe.post_started)
        self.assertEqual(post_task_completed, job_exe.post_completed)
        self.assertEqual(3, job_exe.post_exit_code)
        self.assertEqual('COMPLETED', job_exe.status)
        self.assertGreater(job_exe.ended, post_task_completed)
Example #30
    def test_failed_normal_job_execution(self):
        """Tests running through a normal job execution that fails"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        error = error_test_utils.create_error()
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Fail pre-task
        pre_task_failed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_failed
        running_job_exe.task_fail(pre_task_results, error)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_failed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(error.id, job_exe.error_id)
        self.assertGreater(job_exe.ended, pre_task_failed)
Example #31
    def test_lost_job_execution(self):
        """Tests running through a job execution that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets lost
        when_lost = pre_task_completed + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        lost_task = running_job_exe.execution_lost(when_lost)
        self.assertEqual(job_task.id, lost_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(
            Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
        self.assertEqual(when_lost, job_exe.ended)
Example #32
    def test_failed_normal_job_execution(self):
        """Tests running through a normal job execution that fails"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        error = error_test_utils.create_error()
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Fail pre-task
        pre_task_failed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_failed
        running_job_exe.task_fail(pre_task_results, error)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_failed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(error.id, job_exe.error_id)
        self.assertGreater(job_exe.ended, pre_task_failed)
Example #33
    def test_general_algorithm_error(self):
        """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit code)
        """

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        running_job_exe.task_start(job_task_id, job_task_started)

        # Fail job-task
        job_task_failed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = job_task_failed
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(
            id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'algorithm-unknown')
Example #34
    def test_successful_normal_job_execution(self):
        """Tests running through a normal job execution successfully"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task sets updated command arguments
        updated_commands_args = '-arg updated'
        JobExecution.objects.filter(id=self._job_exe_id).update(
            command_arguments=updated_commands_args)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        self.assertEqual(task._command_arguments, updated_commands_args)  # Make sure job task has updated command args
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Job-task running
        job_task_started = pre_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(job_task_id, job_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 2
        job_task_results.when = job_task_completed
        running_job_exe.task_complete(job_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Post-task running
        post_task_started = job_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(post_task_id, post_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete post-task
        post_task_completed = post_task_started + timedelta(seconds=1)
        post_task_results = TaskResults(post_task_id)
        post_task_results.exit_code = 3
        post_task_results.when = post_task_completed
        running_job_exe.task_complete(post_task_results)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_completed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual(job_task_started, job_exe.job_started)
        self.assertEqual(job_task_completed, job_exe.job_completed)
        self.assertEqual(2, job_exe.job_exit_code)
        self.assertEqual(post_task_started, job_exe.post_started)
        self.assertEqual(post_task_completed, job_exe.post_completed)
        self.assertEqual(3, job_exe.post_exit_code)
        self.assertEqual('COMPLETED', job_exe.status)
        self.assertGreater(job_exe.ended, post_task_completed)