コード例 #1
0
    def stop_job(self):
        """
        Stop the job through the stateless service StopJob API.

        Every attempt first reads the job info to obtain the current entity
        version and sends that version with the request. When the RPC fails
        with ABORTED and the error details contain
        INVALID_ENTITY_VERSION_ERR_MESSAGE, the whole sequence is repeated;
        any other gRPC error is re-raised to the caller. There is no upper
        bound on the number of attempts.
        """
        while True:
            # Re-read the entity version on each attempt so that a retry
            # never reuses the version that was just rejected.
            current_version = self.get_job_info().status.version.value
            stop_request = stateless_svc.StopJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=current_version),
            )
            try:
                self.client.stateless_svc.StopJob(
                    stop_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=default_timeout,
                )
            except grpc.RpcError as err:
                # Only a stale entity version is worth another attempt;
                # everything else goes back to the caller untouched.
                if err.code() != grpc.StatusCode.ABORTED:
                    raise
                if INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details():
                    raise
            else:
                return
コード例 #2
0
    def resume(self, entity_version=None):
        """
        Resume the given update.

        :param entity_version: the entity version of the job, for concurrency
            control. If provided, it is sent as is and a version mismatch is
            raised to the caller. If not provided, the version cached on the
            job (or, failing that, the one read from the job status) is used;
            on a version mismatch the version is re-read from the job status
            and the call is retried until it is accepted.
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch

        Nothing is returned; the entity version from the response is stored
        on ``self.job.entity_version``.
        """
        job_entity_version = (
            entity_version
            or self.job.entity_version
            or self.job.get_status().version.value
        )

        # The other entity-version retry loops in this file match ABORTED,
        # while this method historically matched INVALID_ARGUMENT. Which code
        # the server uses is not visible from here, so both are accepted; the
        # error-message check below still restricts the retry to version
        # mismatches.
        stale_version_codes = (
            grpc.StatusCode.ABORTED,
            grpc.StatusCode.INVALID_ARGUMENT,
        )

        while True:
            request = stateless_svc.ResumeJobWorkflowRequest(
                job_id=v1alpha_peloton.JobID(value=self.job.job_id),
                version=v1alpha_peloton.EntityVersion(value=job_entity_version),
            )
            try:
                resp = self.client.stateless_svc.ResumeJobWorkflow(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as e:
                # If the entity version is incorrect and the caller did not
                # specify one, read the current version from the job status
                # and try again.
                if (
                    entity_version is None
                    and e.code() in stale_version_codes
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                ):
                    job_entity_version = self.job.get_status().version.value
                    continue
                raise
            break
        self.job.entity_version = resp.version.value
        log.info('job workflow resumed: %s', self.job.entity_version)
コード例 #3
0
    def create(self, in_place=False, entity_version=None):
        """
        Replace the job spec with the spec held by this update.

        Waits for the job manager, makes sure the resource pool exists and
        stamps its ID on the updated spec, then issues ReplaceJob.

        :param in_place: forwarded as the ``in_place`` field of the update
            spec
        :param entity_version: entity version to send. When given, it is used
            as is and a version mismatch is raised to the caller. When
            omitted, the version cached on the job (or, failing that, the one
            read from the job status) is used, and on a version mismatch the
            version is re-read from the job status and the call is retried
            until it is accepted.
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch

        Nothing is returned; the entity version from the response is stored
        on ``self.job.entity_version``.
        """
        # wait for job manager leader
        self.job.wait_for_jobmgr_available()

        self.updated_job_spec.respool_id.value = self.pool.ensure_exists()

        version_value = (
            entity_version
            or self.job.entity_version
            or self.job.get_status().version.value
        )

        while True:
            job_id = v1alpha_peloton.JobID(value=self.job.job_id)
            version = v1alpha_peloton.EntityVersion(value=version_value)
            update_spec = stateless.UpdateSpec(
                batch_size=self.batch_size,
                rollback_on_failure=self.roll_back_on_failure,
                max_instance_retries=self.max_instance_attempts,
                max_tolerable_instance_failures=self.max_failure_instances,
                start_paused=self.start_paused,
                in_place=in_place,
            )
            replace_request = stateless_svc.ReplaceJobRequest(
                job_id=job_id,
                version=version,
                spec=self.updated_job_spec,
                update_spec=update_spec,
            )
            try:
                resp = self.client.stateless_svc.ReplaceJob(
                    replace_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as err:
                stale_version = (
                    err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
                )
                # A version pinned by the caller is never silently replaced.
                if not stale_version or entity_version is not None:
                    raise
                # No version was supplied: pick up the current one from the
                # job status and go around again.
                version_value = self.job.get_status().version.value
            else:
                break

        self.job.entity_version = resp.version.value
        log.info(
            "job spec replaced with new entity version: %s",
            self.job.entity_version,
        )
コード例 #4
0
    def update_job(
        self,
        instance_inc,
        batch_size,
        use_instance_config,
        sleep_time,
        host_limit_1=False,
    ):
        """
        Replace the job with a spec that has ``instance_inc`` more instances
        than the job currently reports.

        :param instance_inc: number of instances added on top of the current
            instance count; also used in the spec text and the ``task_num``
            label
        :param batch_size: batch size placed in the update spec
        :param use_instance_config: accepted but not used by this method
        :param sleep_time: passed to ``create_pod_config`` and used in the
            spec text and the ``sleep_time`` label
        :param host_limit_1: forwarded to ``create_pod_config``
        :return: the ReplaceJob response
        :raises grpc.RpcError: for any RPC failure other than an ABORTED
            entity-version mismatch, which is retried without limit
        """
        pod_config = self.create_pod_config(
            sleep_time, "static", host_limit_1=host_limit_1)
        labels = [
            v1alpha_peloton.Label(key="task_num", value=str(instance_inc)),
            v1alpha_peloton.Label(key="sleep_time", value=str(sleep_time)),
        ]
        new_spec = create_stateless_job_spec(
            "instance %s && sleep %s" % (instance_inc, sleep_time),
            labels,
            instance_inc,
            pod_config,
            self.respool_id,
        )
        rollout = stateless.UpdateSpec(batch_size=batch_size)

        while True:
            # Each attempt starts from fresh job info: it supplies both the
            # entity version and the instance count the increment is added to.
            current = self.get_job_info()
            new_spec.instance_count = (
                current.spec.instance_count + instance_inc
            )
            replace_request = stateless_svc.ReplaceJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(
                    value=current.status.version.value),
                spec=new_spec,
                update_spec=rollout,
            )
            try:
                resp = self.client.stateless_svc.ReplaceJob(
                    replace_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=default_timeout,
                )
            except grpc.RpcError as err:
                # Anything but a stale entity version is the caller's problem.
                if err.code() != grpc.StatusCode.ABORTED:
                    raise
                if INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details():
                    raise
            else:
                return resp
コード例 #5
0
ファイル: stateless_job.py プロジェクト: daimazai/peloton
    def delete(self, entity_version=None, force_delete=False):
        """
        Delete the job.

        :param entity_version: the entity version of the job, for concurrency
            control. If provided, delete uses that value and raises if the
            version is wrong. If not provided, delete uses the cached entity
            version (or the one read from the job status) and, on a version
            mismatch, re-reads the version from the job status and retries
            until it is accepted.
        :param force_delete: sent as the ``force`` field of the request. Per
            the API description, a forced delete removes the job even if it is
            running: the job is stopped first and then deleted. This cannot be
            undone, and the job cannot be re-created (with the same uuid)
            until the delete is complete.
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch
        """
        version_value = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            delete_request = stateless_svc.DeleteJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_value),
                force=force_delete,
            )
            try:
                self.client.stateless_svc.DeleteJob(
                    delete_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as err:
                stale_version = (
                    err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
                )
                # A version pinned by the caller is never silently replaced.
                if not stale_version or entity_version is not None:
                    raise
                # Fetch the current entity version from the job status and
                # try again.
                version_value = self.get_status().version.value
            else:
                break

        log.info("job %s deleted", self.job_id)
コード例 #6
0
ファイル: stateless_job.py プロジェクト: daimazai/peloton
    def restart(
        self, entity_version=None, batch_size=None, ranges=None, in_place=False
    ):
        """
        Restart pods of the job through the RestartJob API.

        :param entity_version: entity version to send. When given, it is used
            as is and a version mismatch is raised to the caller. When
            omitted, the cached entity version (or the one read from the job
            status) is used and, on a version mismatch, the version is re-read
            from the job status and the call is retried.
        :param batch_size: placed in the restart spec
        :param ranges: instance ranges placed in the restart spec; per the API
            description, all pods are restarted when no ranges are given
        :param in_place: placed in the restart spec
        :return: restart response from the API
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch
        """
        version_value = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            restart_spec = stateless.RestartSpec(
                batch_size=batch_size, ranges=ranges, in_place=in_place
            )
            restart_request = stateless_svc.RestartJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_value),
                restart_spec=restart_spec,
            )
            try:
                resp = self.client.stateless_svc.RestartJob(
                    restart_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as err:
                stale_version = (
                    err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
                )
                # A version pinned by the caller is never silently replaced.
                if not stale_version or entity_version is not None:
                    raise
                # Fetch the current entity version from the job status and
                # try again.
                version_value = self.get_status().version.value
            else:
                break

        # Remember the version the server handed back for later calls.
        self.entity_version = resp.version.value
        log.info("job restarted, new entity version: %s", self.entity_version)
        return resp
コード例 #7
0
ファイル: stateless_job.py プロジェクト: zhaohc10/peloton
    def get_replace_job_diff(self, entity_version=None, job_spec=None):
        """
        Call the GetReplaceJobDiff API for this job.

        :param entity_version: entity version to send; when falsy, the cached
            entity version is used, and failing that the version read from
            the job status
        :param job_spec: the spec sent in the request
        :return: get replace job diff response.

        Unlike the write APIs there is no retry here: any gRPC error,
        including a version mismatch, propagates to the caller.
        """
        version_value = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )
        diff_request = stateless_svc.GetReplaceJobDiffRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=version_value),
            spec=job_spec,
        )
        return self.client.stateless_svc.GetReplaceJobDiff(
            diff_request,
            metadata=self.client.jobmgr_metadata,
            timeout=self.config.rpc_timeout_sec,
        )
コード例 #8
0
ファイル: stateless_job.py プロジェクト: zhaohc10/peloton
 def wait_for_jobmgr_available(self):
     """
     Utility method to wait for the job manager leader to come up.
     Good practice to call it before all write APIs.

     The job manager is probed with a DeleteJob request that carries a
     dummy entity version. As soon as the probe fails with anything other
     than UNAVAILABLE the method returns. Otherwise it sleeps
     ``config.sleep_time_sec`` and probes again, for at most
     ``config.max_retry_attempts`` attempts; running out of attempts is
     not reported to the caller.
     """
     attempt = 0
     while attempt < self.config.max_retry_attempts:
         try:
             probe = stateless_svc.DeleteJobRequest(
                 job_id=v1alpha_peloton.JobID(value=self.job_id),
                 version=v1alpha_peloton.EntityVersion(
                     value="dummy-entity-version"),
             )
             self.client.stateless_svc.DeleteJob(
                 probe,
                 metadata=self.client.jobmgr_metadata,
                 timeout=self.config.rpc_timeout_sec,
             )
         except grpc.RpcError as err:
             # Any answer other than UNAVAILABLE means something responded.
             if err.code() != grpc.StatusCode.UNAVAILABLE:
                 return
         # NOTE: a probe that raises nothing also lands here and is treated
         # like UNAVAILABLE, i.e. the loop keeps waiting.
         log.info("waiting for job manager leader")
         time.sleep(self.config.sleep_time_sec)
         attempt += 1
コード例 #9
0
    def pause(self, entity_version=None):
        """
        Pause the given update.

        :param entity_version: entity version to send. When given, it is used
            as is and a version mismatch is raised to the caller. When
            omitted, the version cached on the job (or, failing that, the one
            read from the job status) is used and, on a version mismatch, the
            version is re-read from the job status and the call is retried.
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch

        Nothing is returned; the entity version from the response is stored
        on ``self.job.entity_version``.
        """
        # wait for job manager leader
        self.job.wait_for_jobmgr_available()

        version_value = (
            entity_version
            or self.job.entity_version
            or self.job.get_status().version.value
        )

        while True:
            pause_request = stateless_svc.PauseJobWorkflowRequest(
                job_id=v1alpha_peloton.JobID(value=self.job.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_value),
            )
            try:
                resp = self.client.stateless_svc.PauseJobWorkflow(
                    pause_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as err:
                stale_version = (
                    err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
                )
                # A version pinned by the caller is never silently replaced.
                if not stale_version or entity_version is not None:
                    raise
                # No version was supplied: pick up the current one from the
                # job status and go around again.
                version_value = self.job.get_status().version.value
            else:
                break

        self.job.entity_version = resp.version.value
        log.info("job workflow paused: %s", self.job.entity_version)
コード例 #10
0
ファイル: stateless_job.py プロジェクト: zhaohc10/peloton
    def stop(self, ranges=None, entity_version=None):
        """
        Stop the whole job, or only the pods covered by ``ranges``.

        The job-level stop API does not support ranges, so a ranged stop is
        carried out pod by pod through the pod API. This is kept for backward
        compatibility of existing tests.

        :param ranges: the instance ranges to stop; each range covers the pod
            IDs from its ``from`` attribute up to, but not including, its
            ``to`` attribute. When None, the job itself is stopped.
        :param entity_version: the entity version of the job, for concurrency
            control; only used for a job-level stop. If provided, stop uses
            that value and raises if the version is wrong. If not provided,
            stop uses the cached entity version (or the one read from the job
            status) and, on a version mismatch, re-reads the version from the
            job status and retries until it is accepted.
        :return: the StopJob response for a job-level stop, otherwise an
            empty StopPodResponse
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch
        """
        # wait for job manager leader
        self.wait_for_jobmgr_available()

        if ranges is not None:
            for pod_range in ranges:
                # "from" is a Python keyword, hence the getattr.
                first_pod = getattr(pod_range, "from")
                for pod_id in range(first_pod, pod_range.to):
                    pod_name = self.job_id + "-" + str(pod_id)
                    pod_request = pod_svc.StopPodRequest(
                        pod_name=v1alpha_peloton.PodName(value=pod_name))
                    self.client.pod_svc.StopPod(
                        pod_request,
                        metadata=self.client.jobmgr_metadata,
                        timeout=self.config.rpc_timeout_sec,
                    )

            log.info("stopping pods in job {0} with ranges {1}".format(
                self.job_id, ranges))
            return pod_svc.StopPodResponse()

        version_value = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            stop_request = stateless_svc.StopJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_value),
            )
            try:
                resp = self.client.stateless_svc.StopJob(
                    stop_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as err:
                stale_version = (
                    err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
                )
                # A version pinned by the caller is never silently replaced.
                if not stale_version or entity_version is not None:
                    raise
                # Fetch the current entity version from the job status and
                # try again.
                version_value = self.get_status().version.value
            else:
                break

        self.entity_version = resp.version.value
        log.info("job stopped, new entity version: %s",
                 self.entity_version)
        return resp
コード例 #11
0
    def start(self, ranges=None, entity_version=None):
        """
        Starts certain pods based on the ranges.
        If ranges is not provided it starts all pods of the job.

        Job level start does not support range.
        We are using pod api for range operation.
        We do this for backward compatibility of existing tests.

        :param ranges: the instance ranges to start; each range covers the
            pod IDs from its ``from`` attribute up to, but not including, its
            ``to`` attribute
        :param entity_version: the entity version of the job, for concurrency
            control; only used for a job-level start.
            If entity_version is provided, start will use the provided value,
            and raise an exception if version is wrong.
            If entity_version is not provided, start will use the cached
            entity version (or query the job status for it) and, on a version
            mismatch, re-read it from the job status and retry until the
            version is correct.
        :return: the StartJob response for a job-level start, otherwise an
            empty StartPodResponse
        :raises grpc.RpcError: for any RPC failure that is not a retried
            version mismatch
        """
        if ranges is None:
            job_entity_version = (
                entity_version
                or self.entity_version
                or self.get_status().version.value
            )

            # The other entity-version retry loops in this file match
            # ABORTED, while this method historically matched
            # INVALID_ARGUMENT. Which code the server uses is not visible
            # from here, so both are accepted; the error-message check below
            # still restricts the retry to version mismatches.
            stale_version_codes = (
                grpc.StatusCode.ABORTED,
                grpc.StatusCode.INVALID_ARGUMENT,
            )

            while True:
                request = stateless_svc.StartJobRequest(
                    job_id=v1alpha_peloton.JobID(value=self.job_id),
                    version=v1alpha_peloton.EntityVersion(
                        value=job_entity_version),
                )
                try:
                    resp = self.client.stateless_svc.StartJob(
                        request,
                        metadata=self.client.jobmgr_metadata,
                        timeout=self.config.rpc_timeout_sec,
                    )
                except grpc.RpcError as e:
                    # If the entity version is incorrect and the caller did
                    # not specify one, get the entity version from the job
                    # status and try again.
                    if (
                        entity_version is None
                        and e.code() in stale_version_codes
                        and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                    ):
                        job_entity_version = self.get_status().version.value
                        continue
                    raise
                break
            self.entity_version = resp.version.value
            log.info('job started, new entity version: %s',
                     self.entity_version)
            return resp

        for pod_range in ranges:
            # 'from' is a Python keyword, hence the getattr.
            for pod_id in range(getattr(pod_range, 'from'), pod_range.to):
                pod_name = self.job_id + '-' + str(pod_id)
                request = pod_svc.StartPodRequest(
                    pod_name=v1alpha_peloton.PodName(value=pod_name), )
                self.client.pod_svc.StartPod(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )

        log.info('starting pods in job {0} with ranges {1}'.format(
            self.job_id, ranges))
        return pod_svc.StartPodResponse()