def get_job_info(self):
    """
    Issues a GetJob request for this job.

    :return: the ``job_info`` field of the GetJob response.
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    request = stateless_svc.GetJobRequest(job_id=job_id)
    get_job_rpc = self.client.stateless_svc.GetJob
    response = get_job_rpc(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=default_timeout,
    )
    return response.job_info
def get_job(self):
    """
    Issues a GetJob request for this job.

    :return: the full GetJob response, i.e. the configuration and
        runtime status of the job.
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    request = stateless_svc.GetJobRequest(job_id=job_id)
    get_job_rpc = self.client.stateless_svc.GetJob
    return get_job_rpc(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
def wait_for_workflow_state(self, goal_state="SUCCEEDED", failed_state="FAILED"):
    """
    Waits for the job workflow to reach a particular state.

    The job is polled via GetJob every ``self.config.sleep_time_sec``
    seconds. The attempt counter only advances when a poll fails or when
    the number of processed instances (completed + failed) has not changed
    since the previous poll; any progress resets it to zero.

    :param goal_state: The state to reach (v0 name, without the
        ``WORKFLOW_STATE_`` prefix)
    :param failed_state: The failed state of the job (v0 name, without the
        ``WORKFLOW_STATE_`` prefix)
    :raises AssertionError: if the workflow reaches ``failed_state`` or if
        ``self.config.max_retry_attempts`` polls pass without progress
    """
    state = ""
    attempts = 0
    start = time.time()
    log.info("%s waiting for state workflow %s", self.job_id, goal_state)

    # convert the name from v0 state name to v1 alpha state name,
    # so the function signature can be shared between the apis
    goal_state = "WORKFLOW_STATE_" + goal_state
    failed_state = "WORKFLOW_STATE_" + failed_state

    instance_completed = 0
    while attempts < self.config.max_retry_attempts:
        try:
            request = stateless_svc.GetJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id))
            resp = self.client.stateless_svc.GetJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            status = resp.workflow_info.status
            new_state = stateless.WorkflowState.Name(status.state)
        except Exception as e:
            # Best-effort polling: any failure to fetch or decode the
            # status is logged and counted as an attempt, then retried.
            log.warning(e)
            attempts += 1
        else:
            if state != new_state:
                log.info("%s transitioned to state %s", self.job_id, new_state)
            state = new_state

            # Terminal states end the wait immediately, without sleeping.
            if state == goal_state or state == failed_state:
                break

            # for workflow, we only begin to count attempts when no
            # progress is made
            processed = (
                status.num_instances_completed + status.num_instances_failed
            )
            if processed == instance_completed:
                attempts += 1
            else:
                instance_completed = processed
                attempts = 0

        time.sleep(self.config.sleep_time_sec)

    if state != goal_state:
        # Raise explicitly instead of `assert False`, which would be
        # stripped (and the failure silently ignored) under `python -O`.
        if state == failed_state:
            log.info(
                "goal_state:%s current_state:%s attempts: %s",
                goal_state,
                state,
                str(attempts),
            )
            raise AssertionError(
                "%s workflow reached failed state %s while waiting for %s"
                % (self.job_id, state, goal_state)
            )

        log.info("%s max attempts reached to wait for goal state", self.job_id)
        log.info("goal_state:%s current_state:%s", goal_state, state)
        raise AssertionError(
            "%s max attempts reached waiting for workflow state %s "
            "(current state: %s)" % (self.job_id, goal_state, state)
        )

    end = time.time()
    elapsed = end - start
    log.info("%s state transition took %s seconds", self.job_id, elapsed)
def wait_for_state(self, goal_state='SUCCEEDED', failed_state='FAILED'):
    """
    Waits for the job to reach a particular state.

    The job is polled via GetJob up to ``self.config.max_retry_attempts``
    times, sleeping ``self.config.sleep_time_sec`` seconds between polls.
    Success is decided by the observed state, so reaching the goal on the
    final permitted poll is not mistaken for a timeout.

    :param goal_state: The state to reach (v0 name, without the
        ``JOB_STATE_`` prefix)
    :param failed_state: The failed state of the job (v0 name, without the
        ``JOB_STATE_`` prefix)
    :raises AssertionError: if the job reaches ``failed_state`` or if the
        goal state is not observed within the permitted number of polls
    """
    state = ''
    attempts = 0
    start = time.time()
    log.info('%s waiting for state %s', self.job_id, goal_state)

    # convert the name from v0 state name to v1 alpha state name,
    # so the function signature can be shared between the apis
    goal_state = 'JOB_STATE_' + goal_state
    failed_state = 'JOB_STATE_' + failed_state

    while attempts < self.config.max_retry_attempts:
        try:
            request = stateless_svc.GetJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
            )
            resp = self.client.stateless_svc.GetJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            status = resp.job_info.status
            new_state = stateless.JobState.Name(status.state)
        except Exception as e:
            # Best-effort polling: any failure to fetch or decode the
            # status is logged and the poll is retried after the sleep.
            log.warning(e)
        else:
            if state != new_state:
                log.info('%s transitioned to state %s',
                         self.job_id, new_state)
            state = new_state

            # Terminal states end the wait immediately: no extra sleep and
            # no attempt increment that could masquerade as a timeout.
            if state == goal_state or state == failed_state:
                break

        time.sleep(self.config.sleep_time_sec)
        attempts += 1

    if state != goal_state:
        # Raise explicitly instead of `assert False`, which would be
        # stripped (and the failure silently ignored) under `python -O`.
        if state == failed_state:
            log.info('goal_state:%s current_state:%s attempts: %s',
                     goal_state, state, str(attempts))
            raise AssertionError(
                '%s reached failed state %s while waiting for %s'
                % (self.job_id, state, goal_state)
            )

        log.info('%s max attempts reached to wait for goal state',
                 self.job_id)
        log.info('goal_state:%s current_state:%s', goal_state, state)
        raise AssertionError(
            '%s max attempts reached waiting for state %s '
            '(current state: %s)' % (self.job_id, goal_state, state)
        )

    end = time.time()
    elapsed = end - start
    log.info('%s state transition took %s seconds', self.job_id, elapsed)