def stop_task(self, job_id, instance_id):
    """
    :param job_id: id of the job
    :param instance_id: instance id of the task to stop
    :type job_id: str
    :type instance_id: int
    :rtype: task.StopResponse
    """
    # "from" is a reserved keyword in Python, so it cannot be passed as a
    # keyword argument to the InstanceRange constructor; set it via setattr.
    rng = task.InstanceRange(to=instance_id + 1)
    setattr(rng, "from", instance_id)
    request = task.StopRequest(
        jobId=peloton.JobID(value=job_id),
        ranges=[rng],
    )
    try:
        print_okblue("Stopping task %d of Job %s" % (instance_id, job_id))
        resp = self.client.task_svc.Stop(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        print_fail("Exception calling Stop Tasks: %s" % str(e))
        raise
def update_stateless_job(self, job_id, new_job_config):
    """
    :param job_id: id of the job
    :param new_job_config: new config of the job
    :type job_id: str
    :type new_job_config: job.JobConfig
    :rtype: update_svc.CreateUpdateResponse
    """
    request = update_svc.CreateUpdateRequest(
        jobId=peloton.JobID(value=job_id),
        jobConfig=new_job_config,
        updateConfig=update_pb2.UpdateConfig(),
    )
    try:
        print_okblue("Updating Job %s" % job_id)
        resp = self.client.update_svc.CreateUpdate(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        print_fail("Exception calling Update Stateless Job: %s" % str(e))
        raise
def stop_job(self):
    """
    Stops all tasks of the job.
    """
    request = task.StopRequest(jobId=peloton.JobID(value=self.job_id))
    self.client.task_svc.Stop(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=default_timeout,
    )
def update_job(
    self,
    instance_inc,
    batch_size,
    use_instance_config,
    sleep_time,
    host_limit_1=False,
):
    """
    Increases the instance count of the job by instance_inc, optionally
    giving each new instance its own pod config.
    """
    job_info = self.get_job_info()
    job_config = job_info.config

    if use_instance_config:
        instance_config = {}
        for i in range(0, instance_inc):
            count = job_config.instanceCount + i
            instance_config[count] = self.create_pod_config(
                sleep_time, "instance %s" % i)
        job_config.instanceConfig.MergeFrom(instance_config)
    job_config.instanceCount = job_config.instanceCount + instance_inc

    request = job.UpdateRequest(
        id=peloton.JobID(value=self.job_id),
        config=job_config,
    )
    self.client.job_svc.Update(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=default_timeout,
    )
def rolling_restart(self, ranges=None, resource_version=None,
                    batch_size=None):
    """
    Restart a job or certain tasks in a rolling fashion based on the
    ranges and batch size
    :param ranges: the instance ranges to restart
    :param resource_version: the resource version to use; if not set,
        it is fetched from the job runtime
    :param batch_size: the batch size of the rolling restart
    :return: WorkflowResp
    """
    if resource_version is None:
        job_info = self.get_info()
        resource_version = job_info.runtime.configurationVersion

    request = job.RestartRequest(
        id=peloton.JobID(value=self.job_id),
        ranges=ranges,
        resourceVersion=resource_version,
        restartConfig=job.RestartConfig(batchSize=batch_size),
    )
    resp = self.client.job_svc.Restart(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    return Job.WorkflowResp(
        resp.updateID.value,
        resp.resourceVersion,
        client=self.client,
        config=self.config,
    )
def test__delete_active_job(jobs_by_state):
    job = jobs_by_state[1]['RUNNING'][0]
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    client = Client()
    request = job_pb2.DeleteRequest(
        id=peloton.JobID(value=job.job_id),
    )

    failed = True
    try:
        client.job_svc.Delete(
            request,
            metadata=client.jobmgr_metadata,
            timeout=10,
        )
        failed = False
    except grpc.RpcError as e:
        log.info(e)
        errmsg = "Job is not in a terminal state"
        assert errmsg in e.details()
        assert e.code() is grpc.StatusCode.INTERNAL

    job.stop()
    job.wait_for_state(goal_state='KILLED')
    assert failed is True
def get_job_info(self):
    """
    :return: The job info
    """
    request = job.GetRequest(id=peloton.JobID(value=self.job_id))
    resp = self.client.job_svc.Get(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=default_timeout,
    )
    return resp.jobInfo
def get_pod_events(self, instance_id):
    """
    :param instance_id: The instance id of the task
    :return: A list of all of the pod events of the given task
    """
    request = task.GetPodEventsRequest(
        jobId=peloton.JobID(value=self.job_id),
        instanceId=instance_id)
    resp = self.client.task_svc.GetPodEvents(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    return resp.result
def list_tasks(self):
    """
    :return: The map of instance ID to task info for all matching tasks
    """
    request = task.ListRequest(jobId=peloton.JobID(value=self.job_id))
    resp = self.client.task_svc.List(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not resp.HasField("notFound")
    return resp.result
def get_info(self):
    """
    :return: The job info
    """
    request = job.GetRequest(id=peloton.JobID(value=self.job_id))
    resp = self.client.job_svc.Get(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not resp.HasField("error")
    return resp.jobInfo
def query_request(
    job_id, task_state=None, pagination=None, names=None, hosts=None
):
    """
    Constructs a task.QueryRequest object for task query api.
    """
    job_id_spec = peloton.JobID(value=job_id)
    task_states = [task_state] if task_state else []
    pagination = pagination if pagination else default_pagination
    task_query_spec = task.QuerySpec(
        taskStates=task_states,
        pagination=pagination,
        names=names,
        hosts=hosts,
    )
    request = task.QueryRequest(jobId=job_id_spec, spec=task_query_spec)
    return request
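# Hedged usage sketch (not part of the original source): shows how a request
# built by query_request above might be submitted, following the same call
# pattern as the other task_svc RPCs in this section. The existence of a Query
# RPC on the task service, the RUNNING enum constant on the task module, and
# the `records` field on the response are assumptions.
def query_running_tasks(client, job_id):
    request = query_request(job_id, task_state=task.RUNNING)  # assumed enum constant
    resp = client.task_svc.Query(  # assumed RPC name
        request,
        metadata=client.jobmgr_metadata,
        timeout=default_timeout,
    )
    return resp.records  # assumed response field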
def create(self, config_version=None):
    """
    Creates an update based on the job and config.
    If config_version is provided, create will use the provided value,
    and raise an exception if the version is wrong.
    If config_version is not provided, create will query the job runtime
    to get the config version and retry until the version is correct.
    :return: the update ID
    """
    respool_id = self.pool.ensure_exists()
    self.updated_job_config.respoolID.value = respool_id

    while True:
        job_config_version = self.job.get_runtime().configurationVersion
        self.updated_job_config.changeLog.version = (
            config_version or job_config_version)

        request = update_svc.CreateUpdateRequest(
            jobId=peloton.JobID(value=self.job.job_id),
            jobConfig=self.updated_job_config,
            updateConfig=update.UpdateConfig(
                batchSize=self.batch_size,
                rollbackOnFailure=self.roll_back_on_failure,
                maxInstanceAttempts=self.max_instance_attempts,
                maxFailureInstances=self.max_failure_instances,
                startPaused=self.start_paused,
            ),
        )
        try:
            resp = self.client.update_svc.CreateUpdate(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as e:
            # If the config version is incorrect and the caller does not
            # specify a config version, get the config version from the
            # job runtime and try again.
            if (e.code() == grpc.StatusCode.ABORTED
                    and e.details() == INVALID_VERSION_ERR_MESSAGE
                    and config_version is None):
                continue
            raise
        break

    assert resp.updateID.value
    self.workflow = Workflow(resp.updateID.value,
                             client=self.client,
                             config=self.config)
    log.info("created update %s", self.workflow.workflow_id)
def get_task_runs(self, instance_id):
    """
    :param instance_id: The instance id of the task
    :return: Returns all active and completed tasks of the given instance
    """
    request = task.GetRequest(jobId=peloton.JobID(value=self.job_id),
                              instanceId=instance_id)
    resp = self.client.task_svc.Get(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not resp.HasField("notFound")
    assert not resp.HasField("outOfRange")
    return resp.results
def get_task_info(self, instance_id):
    """
    :param instance_id: The instance id of the task
    :return: The task info for the instance id
    """
    request = task.GetRequest(jobId=peloton.JobID(value=self.job_id),
                              instanceId=instance_id)
    resp = self.client.task_svc.Get(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not resp.HasField("notFound")
    assert not resp.HasField("outOfRange")
    return resp.result
def test__delete_completed_job(jobs_by_state):
    job = jobs_by_state[1]["SUCCEEDED"][0]
    job.create()
    job.wait_for_state()

    client = Client()
    request = job_pb2.DeleteRequest(id=peloton.JobID(value=job.job_id))
    try:
        client.job_svc.Delete(request,
                              metadata=client.jobmgr_metadata,
                              timeout=10)
    except grpc.RpcError as e:
        # Deleting a terminal job should not raise; fail the test if it does.
        log.info(e)
        assert e is None
def delete(self):
    """
    Deletes a job
    :return: delete job response from the API
    """
    request = job.DeleteRequest(id=peloton.JobID(value=self.job_id))
    response = self.client.job_svc.Delete(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not response.HasField("error")
    log.info("deleting job {0}".format(self.job_id))
    return response
def test__delete_non_existing_job(peloton_client):
    client = peloton_client
    request = job_pb2.DeleteRequest(id=peloton.JobID(
        value="00010203-0405-0607-0809-0a0b0c0d0e0f"))

    failed = True
    try:
        client.job_svc.Delete(request,
                              metadata=client.jobmgr_metadata,
                              timeout=10)
        failed = False
    except grpc.RpcError as e:
        log.info(e)
        assert e.details() == "job not found"
        assert e.code() is grpc.StatusCode.NOT_FOUND
    assert failed is True
def stop(self, ranges=None):
    """
    Stops a job or certain tasks based on the ranges
    :param ranges: the instance ranges to stop
    :return: task stop response from the API
    """
    request = task.StopRequest(jobId=peloton.JobID(value=self.job_id),
                               ranges=ranges)
    response = self.client.task_svc.Stop(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    log.info("stopping tasks in job {0} with ranges {1}".format(
        self.job_id, ranges))
    return response
def browse_task_sandbox(self, instance_id, task_id):
    """
    :param instance_id: The instance id of the task
    :param task_id: The mesos task id of the task
    :return: The BrowseSandboxResponse
    """
    request = task.BrowseSandboxRequest(
        jobId=peloton.JobID(value=self.job_id),
        instanceId=instance_id,
        taskId=task_id)
    resp = self.client.task_svc.BrowseSandbox(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not resp.HasField('error')
    return resp
def get_tasks(self, job_id):
    """
    :param job_id: id of the job
    :type job_id: str
    :rtype: map of instance id to task info (task.ListResponse.Result.value)
    """
    request = task.ListRequest(jobId=peloton.JobID(value=job_id))
    try:
        resp = self.client.task_svc.List(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        ).result.value
        return resp
    except Exception as e:
        print_fail("Exception calling List Tasks: %s" % str(e))
        raise
def get_job(self, job_id):
    """
    :param job_id: the id of the job
    :type job_id: str
    :rtype: job.GetResponse
    """
    request = job.GetRequest(id=peloton.JobID(value=job_id))
    try:
        resp = self.client.job_svc.Get(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        print_fail("Exception calling Get job: %s" % str(e))
        raise
def delete_job(self, job_id):
    """
    :param job_id: id of the job
    :type job_id: str
    :rtype: job.DeleteResponse
    """
    request = job.DeleteRequest(id=peloton.JobID(value=job_id))
    try:
        print_okblue("Deleting job %s" % job_id)
        resp = self.client.job_svc.Delete(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        print_fail("Exception calling Delete job: %s" % str(e))
        raise
def stop_job(self, job_id):
    """
    :param job_id: id of the job
    :type job_id: str
    :rtype: task.StopResponse
    """
    request = task.StopRequest(jobId=peloton.JobID(value=job_id))
    try:
        print_okblue("Killing all tasks of Job %s" % job_id)
        resp = self.client.task_svc.Stop(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        print_fail("Exception calling Stop Tasks: %s" % str(e))
        raise
def wait_for_terminated(self):
    """
    Waits for the job to be terminated
    """
    state = ''
    attempts = 0
    log.info('%s waiting for terminal state', self.job_id)
    terminated = False
    while attempts < self.config.max_retry_attempts:
        try:
            request = job.GetRequest(
                id=peloton.JobID(value=self.job_id),
            )
            resp = self.client.job_svc.Get(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            runtime = resp.jobInfo.runtime
            new_state = job.JobState.Name(runtime.state)
            if state != new_state:
                log.info('%s transitioned to state %s',
                         self.job_id, new_state)
            state = new_state
            if state in ['SUCCEEDED', 'FAILED', 'KILLED']:
                terminated = True
                break
            log.debug(format_stats(runtime.taskStats))
        except Exception as e:
            log.warn(e)
        finally:
            time.sleep(self.config.sleep_time_sec)
            attempts += 1

    if terminated:
        log.info('%s job terminated', self.job_id)
        assert True

    if attempts == self.config.max_retry_attempts:
        log.info('%s max attempts reached to wait for terminal state',
                 self.job_id)
        log.info('current_state:%s', state)
        assert False
def update(self, new_job_file):
    """
    updates a job
    :param new_job_file: The job config file used for updating
    """
    job_config_dump = load_test_config(new_job_file)
    new_job_config = job.JobConfig()
    json_format.ParseDict(job_config_dump, new_job_config)

    request = job.UpdateRequest(id=peloton.JobID(value=self.job_id),
                                config=new_job_config)
    resp = self.client.job_svc.Update(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    assert not resp.HasField("error")

    # update the config
    self.job_config = new_job_config
    log.info("updated job %s", self.job_id)
def wait_for_leader(self):
    """
    Utility method to wait for the job manager leader to come up.
    Good practice to check before all write APIs.
    """
    attempts = 0
    while attempts < self.config.max_retry_attempts:
        try:
            # Issue a dummy delete; UNAVAILABLE means no leader yet,
            # any other response means the leader is up.
            request = job.DeleteRequest(id=peloton.JobID(
                value="dummy_job_id"))
            self.client.job_svc.Delete(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as e:
            if e.code() == grpc.StatusCode.UNAVAILABLE:
                time.sleep(self.config.sleep_time_sec)
                attempts += 1
                continue
        break
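# Illustrative sketch (not part of the original source): one way a write path
# could use wait_for_leader before issuing an RPC, mirroring how start() below
# calls wait_for_jobmgr_available for the same reason. stop() and
# wait_for_leader() are the methods defined in this section; combining them
# like this is an assumption, not existing behavior.
def stop_when_leader_ready(self, ranges=None):
    self.wait_for_leader()
    return self.stop(ranges=ranges)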
def start(self, ranges=None):
    """
    Starts a job or certain tasks based on the ranges
    :param ranges: the instance ranges to start
    :return: task start response from the API
    """
    # wait for job manager leader
    self.wait_for_jobmgr_available()

    request = task.StartRequest(
        jobId=peloton.JobID(value=self.job_id),
        ranges=ranges,
    )
    response = self.client.task_svc.Start(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    log.info(
        "starting tasks in job {0} with ranges {1}".format(
            self.job_id, ranges
        )
    )
    return response
def update_job(self, job_id, new_job_config):
    """
    :param job_id: id of the job
    :param new_job_config: new config of the job
    :type job_id: str
    :type new_job_config: job.JobConfig
    :rtype: job.UpdateResponse
    """
    request = job.UpdateRequest(
        id=peloton.JobID(value=job_id),
        config=new_job_config,
    )
    try:
        print_okblue("Updating Job %s" % job_id)
        resp = self.client.job_svc.Update(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        print_fail('Exception calling Update Job: %s' % str(e))
        raise
def wait_for_state(self, goal_state="SUCCEEDED", failed_state="FAILED"):
    """
    Waits for the job to reach a particular state
    :param goal_state: The state to reach
    :param failed_state: The failed state of the job
    """
    state = ""
    attempts = 0
    start = time.time()
    log.info("%s waiting for state %s", self.job_id, goal_state)
    state_transition_failure = False
    while attempts < self.config.max_retry_attempts:
        try:
            request = job.GetRequest(id=peloton.JobID(value=self.job_id))
            resp = self.client.job_svc.Get(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            runtime = resp.jobInfo.runtime
            new_state = job.JobState.Name(runtime.state)
            if state != new_state:
                log.info("%s transitioned to state %s",
                         self.job_id, new_state)
            state = new_state
            if state == goal_state:
                break
            log.debug(format_stats(runtime.taskStats))
            # If we assert here, we will log the exception,
            # and continue with the finally block. Set a flag
            # here to indicate failure and then break the loop
            # in the finally block
            if state == failed_state:
                state_transition_failure = True
        except Exception as e:
            log.warn(e)
        finally:
            if state_transition_failure:
                break
            time.sleep(self.config.sleep_time_sec)
            attempts += 1

    if state_transition_failure:
        task_failure = self._get_task_failure()
        log.info(
            "%s goal_state:%s current_state:%s attempts: %s",
            self.job_id,
            goal_state,
            state,
            str(attempts),
        )
        for t, failure in task_failure.iteritems():
            log.info(
                "%s task id:%s failed for reason:%s message:%s",
                self.job_id,
                t,
                failure["reason"],
                failure["message"],
            )
        assert False

    if attempts == self.config.max_retry_attempts:
        log.info("%s max attempts reached to wait for goal state",
                 self.job_id)
        log.info(
            "%s goal_state:%s current_state:%s",
            self.job_id,
            goal_state,
            state,
        )
        assert False

    end = time.time()
    elapsed = end - start
    log.info("%s state transition took %s seconds", self.job_id, elapsed)
    assert state == goal_state