Beispiel #1
0
    def stop_task(self, job_id, instance_id):
        """
        Stop one task instance of a job through the task service.

        :param job_id: id of the job
        :param instance_id: instance id of the task to stop
        :type job_id: str
        :type instance_id: int

        :return: the response of ``task_svc.Stop``
        :rtype: task.StopResponse
        :raises Exception: anything raised inside the try block is reported
            with ``print_fail`` and then re-raised unchanged
        """
        # ``from`` is a reserved word and cannot be used as a keyword
        # argument, so that field is assigned through setattr.
        instance_range = task.InstanceRange(to=instance_id + 1)
        setattr(instance_range, "from", instance_id)
        stop_request = task.StopRequest(
            jobId=peloton.JobID(value=job_id),
            ranges=[instance_range],
        )
        try:
            print_okblue("Stopping task %d of Job %s" % (instance_id, job_id))
            return self.client.task_svc.Stop(
                stop_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except Exception as err:
            print_fail("Exception calling Stop Tasks :%s" % str(err))
            raise
Beispiel #2
0
    def update_stateless_job(self, job_id, new_job_config):
        """
        Create an update that applies a new config to a job.

        The update is created with a default ``update_pb2.UpdateConfig()``.

        :param job_id: id of the job
        :param new_job_config: new config of the job
        :type job_id: str
        :type new_job_config: job.JobConfig

        :return: the response of ``update_svc.CreateUpdate``
        :raises Exception: anything raised inside the try block is reported
            with ``print_fail`` and then re-raised unchanged
        """
        create_request = update_svc.CreateUpdateRequest(
            jobId=peloton.JobID(value=job_id),
            jobConfig=new_job_config,
            updateConfig=update_pb2.UpdateConfig(),
        )
        try:
            print_okblue("Updating Job %s" % job_id)
            return self.client.update_svc.CreateUpdate(
                create_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except Exception as err:
            print_fail(
                "Exception calling Update Stateless Job: %s" % str(err)
            )
            raise
 def stop_job(self):
     """
     Ask the task service to stop this job.

     The request carries only the job id (no instance ranges) and the
     service response is discarded.
     """
     stop_request = task.StopRequest(
         jobId=peloton.JobID(value=self.job_id),
     )
     self.client.task_svc.Stop(
         stop_request,
         metadata=self.client.jobmgr_metadata,
         timeout=default_timeout,
     )
 def update_job(
     self,
     instance_inc,
     batch_size,
     use_instance_config,
     sleep_time,
     host_limit_1=False,
 ):
     """
     Grow this job by ``instance_inc`` instances via ``job_svc.Update``.

     :param instance_inc: number of instances to add to the job
     :param batch_size: accepted but not used by this method
     :param use_instance_config: when truthy, a per-instance config built
         by ``create_pod_config`` is merged in for every added instance
     :param sleep_time: forwarded to ``create_pod_config``
     :param host_limit_1: accepted but not used by this method
     """
     updated_config = self.get_job_info().config
     if use_instance_config:
         # The added instances are keyed by the ids that follow the
         # current instance count.
         per_instance = {
             updated_config.instanceCount + offset: self.create_pod_config(
                 sleep_time, "instance %s" % offset)
             for offset in range(instance_inc)
         }
         updated_config.instanceConfig.MergeFrom(per_instance)
     updated_config.instanceCount += instance_inc
     update_request = job.UpdateRequest(
         id=peloton.JobID(value=self.job_id),
         config=updated_config,
     )
     self.client.job_svc.Update(
         update_request,
         metadata=self.client.jobmgr_metadata,
         timeout=default_timeout,
     )
Beispiel #5
0
    def rolling_restart(self,
                        ranges=None,
                        resource_version=None,
                        batch_size=None):
        """
        Restart the job, or a subset of its tasks, in rolling batches.

        :param ranges: the instance ranges to restart
        :param resource_version: the resource version to send; when None it
            is read from ``self.get_info().runtime.configurationVersion``
        :param batch_size: the batch size placed in the restart config
        :return: a ``Job.WorkflowResp`` built from the update id and the
            resource version of the Restart response
        """
        if resource_version is None:
            resource_version = self.get_info().runtime.configurationVersion

        restart_request = job.RestartRequest(
            id=peloton.JobID(value=self.job_id),
            ranges=ranges,
            resourceVersion=resource_version,
            restartConfig=job.RestartConfig(batchSize=batch_size),
        )
        restart_resp = self.client.job_svc.Restart(
            restart_request,
            metadata=self.client.jobmgr_metadata,
            timeout=self.config.rpc_timeout_sec,
        )
        return Job.WorkflowResp(
            restart_resp.updateID.value,
            restart_resp.resourceVersion,
            client=self.client,
            config=self.config,
        )
Beispiel #6
0
def test__delete_active_job(jobs_by_state):
    """
    Deleting a job that is still RUNNING must fail with an INTERNAL
    RpcError whose details mention the non-terminal state.

    The job is stopped and awaited in KILLED state before the final
    assertion on the outcome of the delete call.
    """
    running_job = jobs_by_state[1]['RUNNING'][0]
    running_job.create()
    running_job.wait_for_state(goal_state='RUNNING')

    client = Client()
    delete_request = job_pb2.DeleteRequest(
        id=peloton.JobID(value=running_job.job_id),
    )
    try:
        client.job_svc.Delete(
            delete_request,
            metadata=client.jobmgr_metadata,
            timeout=10,
        )
    except grpc.RpcError as err:
        log.info(err)
        assert "Job is not in a terminal state" in err.details()
        assert err.code() is grpc.StatusCode.INTERNAL
        rejected = True
    else:
        # The delete unexpectedly went through.
        rejected = False
    running_job.stop()
    running_job.wait_for_state(goal_state='KILLED')
    assert rejected is True
 def get_job_info(self):
     """
     Fetch this job from the job service.

     :return: the ``jobInfo`` field of the ``job_svc.Get`` response
     """
     get_request = job.GetRequest(id=peloton.JobID(value=self.job_id))
     get_resp = self.client.job_svc.Get(
         get_request,
         metadata=self.client.jobmgr_metadata,
         timeout=default_timeout,
     )
     return get_resp.jobInfo
Beispiel #8
0
 def get_pod_events(self, instance_id):
     """
     Fetch the pod events of one task instance of this job.

     :param instance_id: The instance id of the task
     :return: The ``result`` field of the GetPodEvents response
     """
     job_ref = peloton.JobID(value=self.job_id)
     events_request = task.GetPodEventsRequest(
         jobId=job_ref,
         instanceId=instance_id,
     )
     events_resp = self.client.task_svc.GetPodEvents(
         events_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     return events_resp.result
Beispiel #9
0
 def list_tasks(self):
     """
     List the tasks of this job through the task service.

     :return: The ``result`` field of the List response (the map of
         instance ID to task info)
     :raises AssertionError: if the response has its ``notFound`` field set
     """
     list_request = task.ListRequest(
         jobId=peloton.JobID(value=self.job_id),
     )
     list_resp = self.client.task_svc.List(
         list_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     assert not list_resp.HasField("notFound")
     return list_resp.result
Beispiel #10
0
 def get_info(self):
     """
     Fetch this job from the job service.

     :return: The ``jobInfo`` field of the Get response
     :raises AssertionError: if the response has its ``error`` field set
     """
     get_request = job.GetRequest(
         id=peloton.JobID(value=self.job_id),
     )
     get_resp = self.client.job_svc.Get(
         get_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     assert not get_resp.HasField("error")
     return get_resp.jobInfo
Beispiel #11
0
def query_request(
    job_id, task_state=None, pagination=None, names=None, hosts=None
):
    """
    Constructs a task.QueryRequest object for task query api.

    :param job_id: id of the job whose tasks are queried
    :param task_state: optional single state to filter on; a falsy value
        means no state filter
    :param pagination: pagination spec; ``default_pagination`` is used when
        falsy
    :param names: passed through to the query spec
    :param hosts: passed through to the query spec
    :return: the populated task.QueryRequest
    """
    job_ref = peloton.JobID(value=job_id)
    spec = task.QuerySpec(
        taskStates=[task_state] if task_state else [],
        pagination=pagination or default_pagination,
        names=names,
        hosts=hosts,
    )
    return task.QueryRequest(jobId=job_ref, spec=spec)
Beispiel #12
0
    def create(self, config_version=None):
        """
        Creates an update based on the job and the updated job config.

        If config_version is provided, it is used as the change-log version
        and a version mismatch reported by the service is raised to the
        caller. If config_version is not provided, the version is read from
        the job runtime and the call is retried until the service stops
        rejecting the version.

        On success the new workflow is stored in ``self.workflow``; nothing
        is returned.

        :param config_version: config version to send, or None to use the
            one from the job runtime
        :raises grpc.RpcError: any RPC failure other than the retried
            invalid-version case
        """
        self.updated_job_config.respoolID.value = self.pool.ensure_exists()

        while True:
            runtime_version = self.job.get_runtime().configurationVersion
            self.updated_job_config.changeLog.version = (
                config_version or runtime_version)

            update_config = update.UpdateConfig(
                batchSize=self.batch_size,
                rollbackOnFailure=self.roll_back_on_failure,
                maxInstanceAttempts=self.max_instance_attempts,
                maxFailureInstances=self.max_failure_instances,
                startPaused=self.start_paused,
            )
            create_request = update_svc.CreateUpdateRequest(
                jobId=peloton.JobID(value=self.job.job_id),
                jobConfig=self.updated_job_config,
                updateConfig=update_config,
            )
            try:
                resp = self.client.update_svc.CreateUpdate(
                    create_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                break
            except grpc.RpcError as err:
                # Only a stale version that the caller did not pin is
                # retried: the next iteration re-reads the version from the
                # job runtime. Everything else goes to the caller.
                stale_version = (
                    err.code() == grpc.StatusCode.ABORTED
                    and err.details() == INVALID_VERSION_ERR_MESSAGE
                    and config_version is None
                )
                if not stale_version:
                    raise

        assert resp.updateID.value
        self.workflow = Workflow(
            resp.updateID.value,
            client=self.client,
            config=self.config,
        )
        log.info("created update %s", self.workflow.workflow_id)
Beispiel #13
0
 def get_task_runs(self, instance_id):
     """
     Fetch the runs of one task instance of this job.

     :param instance_id: The instance id of the task
     :return: The ``results`` field of the task Get response (all active
         and completed tasks of the given instance)
     :raises AssertionError: if the response has ``notFound`` or
         ``outOfRange`` set
     """
     get_request = task.GetRequest(
         jobId=peloton.JobID(value=self.job_id),
         instanceId=instance_id,
     )
     get_resp = self.client.task_svc.Get(
         get_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     assert not get_resp.HasField("notFound")
     assert not get_resp.HasField("outOfRange")
     return get_resp.results
Beispiel #14
0
 def get_task_info(self, instance_id):
     """
     Fetch the task info of one task instance of this job.

     :param instance_id: The instance id of the task
     :return: The ``result`` field of the task Get response
     :raises AssertionError: if the response has ``notFound`` or
         ``outOfRange`` set
     """
     get_request = task.GetRequest(
         jobId=peloton.JobID(value=self.job_id),
         instanceId=instance_id,
     )
     get_resp = self.client.task_svc.Get(
         get_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     assert not get_resp.HasField("notFound")
     assert not get_resp.HasField("outOfRange")
     return get_resp.result
Beispiel #15
0
def test__delete_completed_job(jobs_by_state):
    """
    A job that has been created and awaited with the default
    ``wait_for_state()`` arguments can be deleted without an RpcError.
    """
    finished_job = jobs_by_state[1]["SUCCEEDED"][0]
    finished_job.create()
    finished_job.wait_for_state()

    client = Client()
    delete_request = job_pb2.DeleteRequest(
        id=peloton.JobID(value=finished_job.job_id),
    )
    try:
        client.job_svc.Delete(
            delete_request,
            metadata=client.jobmgr_metadata,
            timeout=10,
        )
    except grpc.RpcError as err:
        log.info(err)
        # Any RpcError means the delete was rejected: this assertion can
        # never hold for a caught exception, so it fails the test.
        assert err is None
Beispiel #16
0
    def delete(self):
        """
        Deletes this job through the job service.

        :return: delete job response from the API
        :raises AssertionError: if the response has its ``error`` field set
        """
        delete_request = job.DeleteRequest(
            id=peloton.JobID(value=self.job_id),
        )
        delete_resp = self.client.job_svc.Delete(
            delete_request,
            metadata=self.client.jobmgr_metadata,
            timeout=self.config.rpc_timeout_sec,
        )
        assert not delete_resp.HasField("error")

        # Logged only once the service has accepted the request.
        log.info("deleting job {0}".format(self.job_id))
        return delete_resp
Beispiel #17
0
def test__delete_non_existing_job(peloton_client):
    """
    Deleting an unknown job id must fail with a NOT_FOUND RpcError whose
    details are exactly "job not found".
    """
    missing_job = peloton.JobID(
        value="00010203-0405-0607-0809-0a0b0c0d0e0f")
    delete_request = job_pb2.DeleteRequest(id=missing_job)
    try:
        peloton_client.job_svc.Delete(
            delete_request,
            metadata=peloton_client.jobmgr_metadata,
            timeout=10,
        )
    except grpc.RpcError as err:
        log.info(err)
        assert err.details() == "job not found"
        assert err.code() is grpc.StatusCode.NOT_FOUND
        rejected = True
    else:
        # The delete unexpectedly went through.
        rejected = False
    assert rejected is True
Beispiel #18
0
 def stop(self, ranges=None):
     """
     Stops this job, or only the tasks selected by ``ranges``.

     :param ranges: the instance ranges to stop
     :return: task stop response from the API
     """
     stop_request = task.StopRequest(
         jobId=peloton.JobID(value=self.job_id),
         ranges=ranges,
     )
     stop_resp = self.client.task_svc.Stop(
         stop_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     # Logged after the RPC has returned.
     log.info(
         "stopping tasks in job {0} with ranges {1}".format(
             self.job_id, ranges
         )
     )
     return stop_resp
Beispiel #19
0
 def browse_task_sandbox(self, instance_id, task_id):
     """
     Browse the sandbox of one task run of this job.

     :param instance_id: The instance id of the task
     :param task_id: The mesos task id of the task
     :return: The BrowseSandboxResponse
     :raises AssertionError: if the response has its ``error`` field set
     """
     browse_request = task.BrowseSandboxRequest(
         jobId=peloton.JobID(value=self.job_id),
         instanceId=instance_id,
         taskId=task_id,
     )
     browse_resp = self.client.task_svc.BrowseSandbox(
         browse_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     assert not browse_resp.HasField('error')
     return browse_resp
Beispiel #20
0
    def get_tasks(self, job_id):
        """
        List the tasks of a job through the task service.

        :param job_id: id of the job
        :type job_id: str

        :return: ``result.value`` of the ``task_svc.List`` response
        :raises Exception: anything raised inside the try block is reported
            with ``print_fail`` and then re-raised unchanged
        """
        list_request = task.ListRequest(jobId=peloton.JobID(value=job_id))
        try:
            list_resp = self.client.task_svc.List(
                list_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
            return list_resp.result.value
        except Exception as err:
            print_fail("Exception calling List Tasks :%s" % str(err))
            raise
Beispiel #21
0
    def get_job(self, job_id):
        """
        Fetch a job through the job service.

        :param job_id: the id of the job
        :type job_id: str

        :return: the response of ``job_svc.Get``
        :rtype: Response
        :raises Exception: anything raised by the call is reported with
            ``print_fail`` and then re-raised unchanged
        """
        get_request = job.GetRequest(id=peloton.JobID(value=job_id))
        try:
            return self.client.job_svc.Get(
                get_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except Exception as err:
            print_fail("Exception calling Get job :%s" % str(err))
            raise
Beispiel #22
0
    def delete_job(self, job_id):
        """
        Delete a job through the job service.

        :param job_id: id of the job
        :type job_id: str

        :return: the response of ``job_svc.Delete``
        :rtype: job.DeleteResponse
        :raises Exception: anything raised inside the try block is reported
            with ``print_fail`` and then re-raised unchanged
        """
        delete_request = job.DeleteRequest(id=peloton.JobID(value=job_id))
        try:
            print_okblue("Deleting job %s" % job_id)
            return self.client.job_svc.Delete(
                delete_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except Exception as err:
            print_fail("Exception calling delete job :%s" % str(err))
            raise
Beispiel #23
0
    def stop_job(self, job_id):
        """
        Stop all tasks of a job through the task service.

        The request carries only the job id, without instance ranges.

        :param job_id: id of the job
        :type job_id: str

        :return: the response of ``task_svc.Stop``
        :rtype: task.StopResponse
        :raises Exception: anything raised inside the try block is reported
            with ``print_fail`` and then re-raised unchanged
        """
        request = task.StopRequest(jobId=peloton.JobID(value=job_id))
        try:
            print_okblue("Killing all tasks of Job %s" % job_id)
            return self.client.task_svc.Stop(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except Exception as e:
            # Name the RPC that actually failed so the message is not
            # mistaken for a List failure.
            print_fail("Exception calling Stop Tasks :%s" % str(e))
            raise
Beispiel #24
0
    def wait_for_terminated(self):
        """
        Waits for the job to reach a terminal state.

        Polls ``job_svc.Get`` up to ``config.max_retry_attempts`` times,
        sleeping ``config.sleep_time_sec`` after every poll that did not see
        a terminal state, until the job state is SUCCEEDED, FAILED or
        KILLED. An exception raised by a poll is logged and counts as a
        used attempt.

        :raises AssertionError: if no terminal state was observed within
            the allowed number of attempts
        """
        state = ''
        attempts = 0
        terminated = False
        log.info('%s waiting for terminal state', self.job_id)
        while attempts < self.config.max_retry_attempts:
            try:
                request = job.GetRequest(
                    id=peloton.JobID(value=self.job_id),
                )
                resp = self.client.job_svc.Get(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                runtime = resp.jobInfo.runtime
                new_state = job.JobState.Name(runtime.state)
                if state != new_state:
                    log.info('%s transitioned to state %s', self.job_id,
                             new_state)
                state = new_state
                if state in ('SUCCEEDED', 'FAILED', 'KILLED'):
                    terminated = True
                else:
                    log.debug(format_stats(runtime.taskStats))
            except Exception as e:
                log.warning(e)
            if terminated:
                # Leave before sleeping and before counting the attempt, so
                # that success on the last allowed poll is not reported as
                # a timeout.
                break
            time.sleep(self.config.sleep_time_sec)
            attempts += 1

        if terminated:
            log.info('%s job terminated', self.job_id)
            return

        log.info('%s max attempts reached to wait for goal state',
                 self.job_id)
        log.info('current_state:%s', state)
        # Raised explicitly so the failure survives ``python -O``.
        raise AssertionError(
            '%s did not reach a terminal state, current_state:%s'
            % (self.job_id, state)
        )
Beispiel #25
0
    def update(self, new_job_file):
        """
        Updates this job from a job config file.

        The file is loaded with ``load_test_config``, parsed into a
        ``job.JobConfig`` and sent with ``job_svc.Update``. When the
        response carries no error, ``self.job_config`` is replaced by the
        new config.

        :param new_job_file: The job config file used for updating
        :raises AssertionError: if the response has its ``error`` field set
        """
        raw_config = load_test_config(new_job_file)
        parsed_config = job.JobConfig()
        json_format.ParseDict(raw_config, parsed_config)

        update_request = job.UpdateRequest(
            id=peloton.JobID(value=self.job_id),
            config=parsed_config,
        )
        update_resp = self.client.job_svc.Update(
            update_request,
            metadata=self.client.jobmgr_metadata,
            timeout=self.config.rpc_timeout_sec,
        )
        assert not update_resp.HasField("error")

        # Remember the config the service accepted.
        self.job_config = parsed_config
        log.info("updated job %s", self.job_id)
Beispiel #26
0
 def wait_for_leader(self):
     """
     Utility method to wait for the job manager leader to come up.
     Good practice to check before all write apis.

     A Delete for a dummy job id is sent as a probe. While the call fails
     with UNAVAILABLE it is retried, up to ``config.max_retry_attempts``
     times with ``config.sleep_time_sec`` between tries. Any other outcome
     (success or a different RpcError) ends the wait. The method returns
     silently in every case, including when the attempts run out.
     """
     attempts = 0
     while attempts < self.config.max_retry_attempts:
         unavailable = False
         try:
             probe = job.DeleteRequest(
                 id=peloton.JobID(value="dummy_job_id"),
             )
             self.client.job_svc.Delete(
                 probe,
                 metadata=self.client.jobmgr_metadata,
                 timeout=self.config.rpc_timeout_sec,
             )
         except grpc.RpcError as err:
             unavailable = err.code() == grpc.StatusCode.UNAVAILABLE
         if not unavailable:
             return
         time.sleep(self.config.sleep_time_sec)
         attempts += 1
Beispiel #27
0
 def start(self, ranges=None):
     """
     Starts this job, or only the tasks selected by ``ranges``.

     ``wait_for_jobmgr_available`` is called first, before the request is
     built and sent.

     :param ranges: the instance ranges to start
     :return: task start response from the API
     """
     self.wait_for_jobmgr_available()
     start_request = task.StartRequest(
         jobId=peloton.JobID(value=self.job_id),
         ranges=ranges,
     )
     start_resp = self.client.task_svc.Start(
         start_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     # Logged after the RPC has returned.
     log.info("starting tasks in job {0} with ranges {1}".format(
         self.job_id, ranges))
     return start_resp
Beispiel #28
0
    def update_job(self, job_id, new_job_config):
        """
        Update a job to a new config through the job service.

        :param job_id: id of the job
        :param new_job_config: new config of the job
        :type job_id: str
        :type new_job_config: job.JobConfig

        :return: the response of ``job_svc.Update``
        :rtype: job.UpdateResponse
        :raises Exception: anything raised inside the try block is reported
            with ``print_fail`` and then re-raised unchanged
        """
        request = job.UpdateRequest(
            id=peloton.JobID(value=job_id),
            config=new_job_config,
        )
        try:
            print_okblue("Updating Job %s" % job_id)
            return self.client.job_svc.Update(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma
        # form is a syntax error on Python 3.
        except Exception as e:
            print_fail('Exception calling Update Job: %s' % str(e))
            raise
Beispiel #29
0
    def wait_for_state(self, goal_state="SUCCEEDED", failed_state="FAILED"):
        """
        Waits for the job to reach a particular state.

        Polls ``job_svc.Get`` up to ``config.max_retry_attempts`` times,
        sleeping ``config.sleep_time_sec`` after every poll that saw neither
        the goal state nor the failed state. An exception raised by a poll
        is logged and counts as a used attempt.

        :param goal_state: The state to reach
        :param failed_state: The failed state of the job; seeing it stops
            the wait immediately and the per-task failures are logged
        :raises AssertionError: if the failed state is observed, or if the
            goal state is not observed within the allowed attempts
        """
        state = ""
        attempts = 0
        start = time.time()
        log.info("%s waiting for state %s", self.job_id, goal_state)
        reached_goal = False
        state_transition_failure = False
        while attempts < self.config.max_retry_attempts:
            try:
                request = job.GetRequest(id=peloton.JobID(value=self.job_id))
                resp = self.client.job_svc.Get(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                runtime = resp.jobInfo.runtime
                new_state = job.JobState.Name(runtime.state)
                if state != new_state:
                    log.info("%s transitioned to state %s", self.job_id,
                             new_state)
                state = new_state
                if state == goal_state:
                    reached_goal = True
                else:
                    log.debug(format_stats(runtime.taskStats))
                    # Raising here would be swallowed by the handler below,
                    # so the failure is recorded in a flag and acted on
                    # after the try block.
                    if state == failed_state:
                        state_transition_failure = True
            except Exception as e:
                log.warning(e)
            if reached_goal or state_transition_failure:
                # Leave before sleeping and before counting the attempt, so
                # that reaching the goal on the last allowed poll is not
                # reported as a timeout.
                break
            time.sleep(self.config.sleep_time_sec)
            attempts += 1

        if state_transition_failure:
            task_failure = self._get_task_failure()
            log.info(
                "%s goal_state:%s current_state:%s attempts: %s",
                self.job_id,
                goal_state,
                state,
                str(attempts),
            )
            # ``items()`` works on both Python 2 and Python 3.
            for t, failure in task_failure.items():
                log.info(
                    "%s task id:%s failed for reason:%s message:%s",
                    self.job_id,
                    t,
                    failure["reason"],
                    failure["message"],
                )
            # Raised explicitly so the failure survives ``python -O``.
            raise AssertionError(
                "%s reached failed state %s while waiting for %s"
                % (self.job_id, state, goal_state)
            )

        if not reached_goal:
            log.info("%s max attempts reached to wait for goal state",
                     self.job_id)
            log.info(
                "%s goal_state:%s current_state:%s",
                self.job_id,
                goal_state,
                state,
            )
            raise AssertionError(
                "%s did not reach goal_state:%s, current_state:%s"
                % (self.job_id, goal_state, state)
            )

        elapsed = time.time() - start
        log.info("%s state transition took %s seconds", self.job_id, elapsed)