def test_bad_job_no_predecessors(self, walk_class, setup_sbx):
    """Check the outcome of walking a DAG made of one failing job."""

    def check_walk(walk):
        # The single job is expected to be a ControlledJob that was not
        # skipped and whose status is ReturnValue(1); nothing should
        # have been requeued.
        bad_job = walk.saved_jobs["1.bad"]
        assert isinstance(bad_job, ControlledJob)
        assert bad_job.should_skip is False
        assert bad_job.status == ReturnValue(1)
        assert walk.job_status == {"1.bad": ReturnValue(1)}
        assert walk.requeued == {}

    dag = DAG()
    dag.add_vertex("1.bad")

    first_walk = walk_class(dag)
    check_walk(first_walk)

    # When fingerprints are in use, walk the very same DAG a second
    # time and verify that the outcome is unchanged.
    if walk_class == FingerprintWalk:
        second_walk = walk_class(dag)
        check_walk(second_walk)
def test_job_depending_on_job_with_no_predicted_fingerprint_failed(setup_sbx):
    """Test a job whose predecessor failed and has a late fingerprint."""
    dag = DAG()
    dag.add_vertex("fingerprint_after_job.bad")
    dag.add_vertex("2", predecessors=["fingerprint_after_job.bad"])

    walk = FingerprintWalk(dag)

    # No fingerprint can be predicted for the first job.
    predicted = walk.compute_fingerprint(
        "fingerprint_after_job.bad", None, is_prediction=True
    )
    assert predicted is None

    # First job ('fingerprint_after_job.bad'): it should be a real job
    # (not skipped) that returned a failure.
    failed_job = walk.saved_jobs["fingerprint_after_job.bad"]
    assert isinstance(failed_job, ControlledJob)
    assert failed_job.should_skip is False
    assert failed_job.status == ReturnValue(1)

    # Second job ('2'): since it depends on a job that failed, it
    # should be reported as skipped with a forced failure.
    dependent_job = walk.saved_jobs["2"]
    assert isinstance(dependent_job, EmptyJob)
    assert dependent_job.should_skip is True
    assert dependent_job.status == ReturnValue.force_fail

    # Nothing should have been requeued.
    assert walk.requeued == {}
def test_failed_predecessor(self, walk_class, setup_sbx):
    """Simulate the scenario where a predecessor failed."""

    def check_walk(walk):
        # The predecessor is a ControlledJob that was not skipped and
        # has status ReturnValue(1).
        failed_job = walk.saved_jobs["1.bad"]
        assert isinstance(failed_job, ControlledJob)
        assert failed_job.should_skip is False
        assert failed_job.status == ReturnValue(1)

        # The dependent job is an EmptyJob flagged as skipped, with a
        # forced-failure status.
        dependent_job = walk.saved_jobs["2"]
        assert isinstance(dependent_job, EmptyJob)
        assert dependent_job.should_skip is True
        assert dependent_job.status == ReturnValue.force_fail

        assert walk.job_status == {
            "1.bad": ReturnValue(1),
            "2": ReturnValue.force_fail,
        }
        assert walk.requeued == {}

    dag = DAG()
    dag.add_vertex("1.bad")
    dag.add_vertex("2", predecessors=["1.bad"])

    first_walk = walk_class(dag)
    check_walk(first_walk)

    # When fingerprints are in use, walk the very same DAG a second
    # time and verify that the outcome is unchanged.
    if walk_class == FingerprintWalk:
        second_walk = walk_class(dag)
        check_walk(second_walk)
def status(self):
    """See Job.status' description.

    :return: ReturnValue.failure when a spawn error was recorded,
        ReturnValue.notready when there is no process handle, and
        otherwise the process handle's status converted to a
        ReturnValue (ReturnValue.failure if that conversion raises
        ValueError).
    """
    if self.__spawn_error:
        return ReturnValue.failure

    if self.proc_handle is None:
        return ReturnValue.notready

    try:
        result = ReturnValue(self.proc_handle.status)
    except ValueError:
        # The raw status could not be converted to a ReturnValue: log
        # it with the traceback and report the job as failed.
        logger.exception(
            'job %s returned an unknown status %s',
            self.uid,
            self.proc_handle.status,
        )
        result = ReturnValue.failure
    return result
def collect(self, job: ProcessJob) -> bool:
    """Collect all the results from the given job.

    Records the job's status, saves its fingerprint when the status
    allows it, logs a one-line summary and requests a requeue when the
    job reports that it was not ready.

    :param job: The job whose results we need to collect.
    :return: True if the job is requeued, False otherwise
    """
    log_format = "[%-10s %-9s %4ds] %s"
    uid = job.uid

    # The fingerprint is only saved when the job went as expected
    # (success or one of the skipped/unchanged statuses). The previous
    # fingerprint was already removed when the job was created, so not
    # saving one here means the action is attempted again next time
    # rather than being skipped.
    fingerprint_statuses = (
        ReturnValue.success,
        ReturnValue.force_skip,
        ReturnValue.skip,
        ReturnValue.unchanged,
    )
    if job.status in fingerprint_statuses:
        self.new_fingerprints[uid] = self.compute_fingerprint(uid, job.data)
        self.save_fingerprint(uid, self.new_fingerprints[uid])

    self.job_status[uid] = ReturnValue(job.status)

    if job.should_skip:
        # Skipped jobs are logged with a duration of 0, except for the
        # forced statuses, which are not logged at all. A skipped job
        # is never requeued.
        unlogged_statuses = (ReturnValue.force_fail, ReturnValue.force_skip)
        if job.status not in unlogged_statuses:
            logging.info(
                log_format,
                job.queue_name,
                self.job_status[uid].name,
                0,
                job.data,
            )
        return False

    logging.info(
        log_format,
        job.queue_name,
        job.status.name,
        int(job.timing_info.duration),
        job.data,
    )

    # A job that was not ready is handed over for requeueing; the
    # outcome of that request is what gets reported to the caller.
    if self.job_status[uid] == ReturnValue.notready:
        return self.request_requeue(job)
    return False