def get_job_info(self):
    """
    Look up this job through the stateless service GetJob RPC.

    :return: the ``job_info`` field of the GetJob response.
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    get_job_request = stateless_svc.GetJobRequest(job_id=job_id)
    response = self.client.stateless_svc.GetJob(
        get_job_request,
        metadata=self.client.jobmgr_metadata,
        timeout=default_timeout,
    )
    return response.job_info
Exemple #2
0
 def get_job(self):
     """
     Look up this job through the stateless service GetJob RPC.

     :return: the full GetJob response, i.e. the configuration and
         runtime status of the job.
     """
     job_id = v1alpha_peloton.JobID(value=self.job_id)
     get_job_request = stateless_svc.GetJobRequest(job_id=job_id)
     # The RPC deadline comes from this object's config.
     return self.client.stateless_svc.GetJob(
         get_job_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
Exemple #3
0
    def wait_for_workflow_state(self,
                                goal_state="SUCCEEDED",
                                failed_state="FAILED"):
        """
        Waits for the job workflow to reach a particular state

        The job is polled through the GetJob RPC, sleeping
        ``self.config.sleep_time_sec`` between polls. An attempt is only
        counted when the RPC raises or when the number of completed plus
        failed instances did not change since the previous poll; any
        progress resets the attempt counter to zero.

        :param goal_state: The state to reach
        :param failed_state: The failed state of the job
        :raises AssertionError: if the workflow reaches ``failed_state``,
            or if ``self.config.max_retry_attempts`` attempts are used up
            before ``goal_state`` is reached
        """
        state = ""
        attempts = 0
        start = time.time()
        log.info("%s waiting for state workflow %s", self.job_id, goal_state)
        # convert the name from v0 state name to v1 alpha state name,
        # so the function signature can be shared between the apis
        goal_state = "WORKFLOW_STATE_" + goal_state
        failed_state = "WORKFLOW_STATE_" + failed_state
        instance_completed = 0
        while attempts < self.config.max_retry_attempts:
            try:
                request = stateless_svc.GetJobRequest(
                    job_id=v1alpha_peloton.JobID(value=self.job_id))
                resp = self.client.stateless_svc.GetJob(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                status = resp.workflow_info.status
                new_state = stateless.WorkflowState.Name(status.state)
            except Exception as e:
                # A failed poll is logged and retried; it costs one attempt.
                log.warning(e)
                attempts += 1
            else:
                if state != new_state:
                    log.info("%s transitioned to state %s", self.job_id,
                             new_state)
                state = new_state
                if state == goal_state:
                    # Done: leave immediately, without another sleep.
                    break

                # for workflow, we only begin to count attempts when no
                # progress is made
                finished = (status.num_instances_completed +
                            status.num_instances_failed)
                if finished == instance_completed:
                    attempts += 1
                else:
                    instance_completed = finished
                    attempts = 0

                # This check sits outside the try block, so the failure is
                # not swallowed by the retry handler above. An explicit
                # raise is used because `assert` is removed under -O.
                if state == failed_state:
                    log.info(
                        "goal_state:%s current_state:%s attempts: %s",
                        goal_state,
                        state,
                        str(attempts),
                    )
                    raise AssertionError(
                        "%s reached failed state %s while waiting for %s" %
                        (self.job_id, state, goal_state))
            time.sleep(self.config.sleep_time_sec)
        else:
            # The while-else branch runs only when the loop ended without
            # `break`, i.e. the attempt budget ran out before the goal.
            log.info("%s max attempts reached to wait for goal state",
                     self.job_id)
            log.info("goal_state:%s current_state:%s", goal_state, state)
            raise AssertionError(
                "%s max attempts reached waiting for %s (current state %s)" %
                (self.job_id, goal_state, state))

        end = time.time()
        elapsed = end - start
        log.info("%s state transition took %s seconds", self.job_id, elapsed)
Exemple #4
0
    def wait_for_state(self, goal_state='SUCCEEDED', failed_state='FAILED'):
        """
        Waits for the job to reach a particular state

        The job is polled through the GetJob RPC at most
        ``self.config.max_retry_attempts`` times, sleeping
        ``self.config.sleep_time_sec`` between polls. Reaching the goal
        state on the last permitted poll counts as success.

        :param goal_state: The state to reach
        :param failed_state: The failed state of the job
        :raises AssertionError: if the job reaches ``failed_state``, or if
            all attempts are used up before ``goal_state`` is reached
        """
        state = ''
        attempts = 0
        start = time.time()
        log.info('%s waiting for state %s', self.job_id, goal_state)
        # convert the name from v0 state name to v1 alpha state name,
        # so the function signature can be shared between the apis
        goal_state = 'JOB_STATE_' + goal_state
        failed_state = 'JOB_STATE_' + failed_state
        while attempts < self.config.max_retry_attempts:
            try:
                request = stateless_svc.GetJobRequest(
                    job_id=v1alpha_peloton.JobID(value=self.job_id), )
                resp = self.client.stateless_svc.GetJob(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                status = resp.job_info.status
                new_state = stateless.JobState.Name(status.state)
            except Exception as e:
                # A failed poll is logged and retried on the next attempt.
                log.warning(e)
            else:
                if state != new_state:
                    log.info('%s transitioned to state %s', self.job_id,
                             new_state)
                state = new_state
                if state == goal_state:
                    # Leave before the attempt counter is bumped, so that
                    # success on the final attempt is not mistaken for a
                    # timeout.
                    break
                # This check sits outside the try block, so the failure is
                # not swallowed by the retry handler above. An explicit
                # raise is used because `assert` is removed under -O.
                if state == failed_state:
                    log.info('goal_state:%s current_state:%s attempts: %s',
                             goal_state, state, str(attempts))
                    raise AssertionError(
                        '%s reached failed state %s while waiting for %s' %
                        (self.job_id, state, goal_state))
            time.sleep(self.config.sleep_time_sec)
            attempts += 1
        else:
            # The while-else branch runs only when the loop ended without
            # `break`, i.e. every attempt was used without reaching the goal.
            log.info('%s max attempts reached to wait for goal state',
                     self.job_id)
            log.info('goal_state:%s current_state:%s', goal_state, state)
            raise AssertionError(
                '%s max attempts reached waiting for %s (current state %s)' %
                (self.job_id, goal_state, state))

        end = time.time()
        elapsed = end - start
        log.info('%s state transition took %s seconds', self.job_id, elapsed)