def inspect_pipeline(
    self, pipeline_name: str, history: int = 0, details: bool = False
) -> Iterator[pps_proto.PipelineInfo]:
    """Inspects a pipeline.

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    history : int, optional
        Indicates to return historical versions of `pipeline_name`.
        Semantics are:

        - 0: Return current version of `pipeline_name`
        - 1: Return the above and `pipeline_name` from the next most
          recent version.
        - 2: etc.
        - -1: Return all historical versions of `pipeline_name`.
    details : bool, optional
        If true, return pipeline details.

    Returns
    -------
    Iterator[pps_proto.PipelineInfo]
        An iterator of protobuf objects that contain info on a pipeline.

    Examples
    --------
    >>> pipeline = next(client.inspect_pipeline("foo"))
    ...
    >>> for p in client.inspect_pipeline("foo", 2):
    >>>     print(p)

    .. # noqa: W505
    """
    if history == 0:
        return iter(
            [
                self._req(
                    Service.PPS,
                    "InspectPipeline",
                    pipeline=pps_proto.Pipeline(name=pipeline_name),
                    details=details,
                )
            ]
        )
    else:
        # `InspectPipeline` doesn't support history, but `ListPipeline`
        # with a pipeline filter does, so we use that here.
        return self._req(
            Service.PPS,
            "ListPipeline",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            history=history,
            details=details,
        )
def inspect_datum(
    self, pipeline_name: str, job_id: str, datum_id: str
) -> pps_proto.DatumInfo:
    """Inspects a datum.

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    job_id : str
        The ID of the job.
    datum_id : str
        The ID of the datum.

    Returns
    -------
    pps_proto.DatumInfo
        A protobuf object with info on the datum.
    """
    return self._req(
        Service.PPS,
        "InspectDatum",
        datum=pps_proto.Datum(
            id=datum_id,
            job=pps_proto.Job(
                pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
            ),
        ),
    )
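# Usage sketch (illustrative, not from the original source): inspecting a
# single datum after its job finishes. Assumes an authenticated `client`;
# the pipeline name, job ID, and datum ID below are placeholders, and the
# `state` field read at the end is an assumption about DatumInfo.
#
#   info = client.inspect_datum(
#       "edges", "467c580611234cdb8cc9758c7aa96087", "<datum-id>"
#   )
#   print(info.state)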
def test_create_pipeline_from_request():
    client = python_pachyderm.Client()

    repo_name = util.create_test_repo(client, "test_create_pipeline_from_request")
    pipeline_name = util.test_repo_name("test_create_pipeline_from_request")

    # more or less a copy of the opencv demo's edges pipeline spec
    client.create_pipeline_from_request(
        pps_proto.CreatePipelineRequest(
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            description="A pipeline that performs image edge detection by using the OpenCV library.",
            input=pps_proto.Input(
                pfs=pps_proto.PFSInput(
                    glob="/*",
                    repo=repo_name,
                ),
            ),
            transform=pps_proto.Transform(
                cmd=["echo", "hi"],
                image="pachyderm/opencv",
            ),
        )
    )

    assert any(p.pipeline.name == pipeline_name for p in list(client.list_pipeline()))
def get_job_logs(
    self,
    pipeline_name: str,
    job_id: str,
    data_filters: List[str] = None,
    datum: pps_proto.Datum = None,
    follow: bool = False,
    tail: int = 0,
    use_loki_backend: bool = False,
    since: duration_pb2.Duration = None,
) -> Iterator[pps_proto.LogMessage]:
    """Gets logs for a job.

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    job_id : str
        The ID of the job.
    data_filters : List[str], optional
        A list of the names of input files from which we want processing
        logs. This may contain multiple files, in case `pipeline_name`
        contains multiple inputs. Each filter may be an absolute path of
        a file within a repo, or it may be a hash for that file (to
        search for files at specific versions).
    datum : pps_proto.Datum, optional
        Filters log lines for the specified datum.
    follow : bool, optional
        If true, continue to follow new logs as they appear.
    tail : int, optional
        If nonzero, the number of lines from the end of the logs to
        return. Note: tail applies per container, so you will get
        `tail` * <number of pods> total lines back.
    use_loki_backend : bool, optional
        If true, use loki as a backend, rather than Kubernetes, for
        fetching logs. Requires a loki-enabled cluster.
    since : duration_pb2.Duration, optional
        Specifies how far in the past to return logs from.

    Returns
    -------
    Iterator[pps_proto.LogMessage]
        An iterator of protobuf objects that contain info on a log from a
        PPS worker. If `follow` is set to ``True``, use ``next()`` to
        iterate through, as the returned stream is potentially endless;
        it might otherwise block your code.
    """
    return self._req(
        Service.PPS,
        "GetLogs",
        job=pps_proto.Job(
            pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
        ),
        data_filters=data_filters,
        datum=datum,
        follow=follow,
        tail=tail,
        use_loki_backend=use_loki_backend,
        since=since,
    )
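# Usage sketch (illustrative): tailing logs for a running job. Assumes an
# authenticated `client`; the pipeline name and job ID are placeholders.
# With follow=True the stream is potentially endless, so pull lines with
# next() or break out of the loop explicitly rather than exhausting the
# iterator.
#
#   logs = client.get_job_logs("edges", "<job-id>", follow=True)
#   print(next(logs).message)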
def inspect_job(
    self,
    job_id: str,
    pipeline_name: str = None,
    wait: bool = False,
    details: bool = False,
) -> Iterator[pps_proto.JobInfo]:
    """Inspects a job.

    Parameters
    ----------
    job_id : str
        The ID of the job.
    pipeline_name : str, optional
        The name of a pipeline.
    wait : bool, optional
        If true, wait until the job completes.
    details : bool, optional
        If true, return worker details.

    Returns
    -------
    Iterator[pps_proto.JobInfo]
        An iterator of protobuf objects that contain info on a subjob
        (job at the pipeline-level).

    Examples
    --------
    >>> # Look at all subjobs in a job
    >>> subjobs = list(client.inspect_job("467c580611234cdb8cc9758c7aa96087"))
    ...
    >>> # Look at a single subjob (job at the pipeline-level)
    >>> subjob = list(client.inspect_job("467c580611234cdb8cc9758c7aa96087", "foo"))[0]

    .. # noqa: W505
    """
    if pipeline_name is not None:
        return iter(
            [
                self._req(
                    Service.PPS,
                    "InspectJob",
                    job=pps_proto.Job(
                        pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
                    ),
                    wait=wait,
                    details=details,
                )
            ]
        )
    else:
        return self._req(
            Service.PPS,
            "InspectJobSet",
            job_set=pps_proto.JobSet(id=job_id),
            wait=wait,
            details=details,
        )
def stop_pipeline(self, pipeline_name: str) -> None:
    """Stops a pipeline.

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    """
    self._req(
        Service.PPS,
        "StopPipeline",
        pipeline=pps_proto.Pipeline(name=pipeline_name),
    )
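# Usage sketch (illustrative): pausing a pipeline named "foo". Assumes an
# authenticated `client`; a stopped pipeline keeps its spec and output repo,
# and the corresponding start_pipeline call resumes it.
#
#   client.stop_pipeline("foo")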
def check_pipeline_spec(req):
    assert req == pps_proto.CreatePipelineRequest(
        pipeline=pps_proto.Pipeline(name="foobar"),
        description="A pipeline that performs image edge detection by using the OpenCV library.",
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(glob="/*", repo="images"),
        ),
        transform=pps_proto.Transform(
            cmd=["python3", "/edges.py"],
            image="pachyderm/opencv",
        ),
    )
def run_cron(self, pipeline_name: str) -> None:
    """Triggers a cron pipeline to run now.

    For more info on cron pipelines:
    https://docs.pachyderm.com/latest/concepts/pipeline-concepts/pipeline/cron/

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    """
    self._req(
        Service.PPS,
        "RunCron",
        pipeline=pps_proto.Pipeline(name=pipeline_name),
    )
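# Usage sketch (illustrative): firing a cron pipeline immediately instead of
# waiting for its schedule. Assumes an authenticated `client` and an existing
# cron pipeline; the name is a placeholder.
#
#   client.run_cron("my-cron-pipeline")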
def delete_job(self, job_id: str, pipeline_name: str) -> None:
    """Deletes a subjob (job at the pipeline-level).

    Parameters
    ----------
    job_id : str
        The ID of the job.
    pipeline_name : str
        The name of the pipeline.
    """
    self._req(
        Service.PPS,
        "DeleteJob",
        job=pps_proto.Job(
            pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
        ),
    )
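# Usage sketch (illustrative): deleting one subjob. Assumes an authenticated
# `client`; the job ID and pipeline name are placeholders.
#
#   client.delete_job("467c580611234cdb8cc9758c7aa96087", "edges")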
def list_datum(
    self,
    pipeline_name: str = None,
    job_id: str = None,
    input: pps_proto.Input = None,
) -> Iterator[pps_proto.DatumInfo]:
    """Lists datums. Exactly one of (`pipeline_name`, `job_id`) (real) or
    `input` (hypothetical) must be set.

    Parameters
    ----------
    pipeline_name : str, optional
        The name of the pipeline.
    job_id : str, optional
        The ID of a job.
    input : pps_proto.Input, optional
        A protobuf object that filters the datums returned. The datums
        listed are the ones that would be run if a pipeline were created
        with the provided input.

    Returns
    -------
    Iterator[pps_proto.DatumInfo]
        An iterator of protobuf objects that contain info on a datum.

    Examples
    --------
    >>> # See hypothetical datums with specified input cross
    >>> datums = list(client.list_datum(input=pps_proto.Input(
    ...     pfs=pps_proto.PFSInput(repo="foo", branch="master", glob="/*"),
    ...     cross=[
    ...         pps_proto.Input(pfs=pps_proto.PFSInput(repo="bar", branch="master", glob="/")),
    ...         pps_proto.Input(pfs=pps_proto.PFSInput(repo="baz", branch="master", glob="/*/*")),
    ...     ]
    ... )))

    .. # noqa: W505
    """
    req = pps_proto.ListDatumRequest()
    if pipeline_name is not None and job_id is not None:
        req.job.CopyFrom(
            pps_proto.Job(
                pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
            )
        )
    else:
        req.input.CopyFrom(input)
    return self._req(Service.PPS, "ListDatum", req=req)
def delete_pipeline(
    self, pipeline_name: str, force: bool = False, keep_repo: bool = False
) -> None:
    """Deletes a pipeline.

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    force : bool, optional
        If true, forces the pipeline deletion.
    keep_repo : bool, optional
        If true, keeps the output repo.
    """
    self._req(
        Service.PPS,
        "DeletePipeline",
        pipeline=pps_proto.Pipeline(name=pipeline_name),
        force=force,
        keep_repo=keep_repo,
    )
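# Usage sketch (illustrative): deleting a pipeline while preserving its
# output repo. Assumes an authenticated `client`; the name is a placeholder.
#
#   client.delete_pipeline("foo", force=True, keep_repo=True)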
def stop_job(self, job_id: str, pipeline_name: str, reason: str = None) -> None:
    """Stops a subjob (job at the pipeline-level).

    Parameters
    ----------
    job_id : str
        The ID of the job.
    pipeline_name : str
        The name of the pipeline.
    reason : str, optional
        A reason for stopping the job.
    """
    self._req(
        Service.PPS,
        "StopJob",
        job=pps_proto.Job(
            pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
        ),
        reason=reason,
    )
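# Usage sketch (illustrative): stopping a subjob and recording why. Assumes
# an authenticated `client`; the ID, name, and reason are placeholders.
#
#   client.stop_job("467c580611234cdb8cc9758c7aa96087", "edges",
#                   reason="bad input data")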
def restart_datum(
    self, pipeline_name: str, job_id: str, data_filters: List[str] = None
) -> None:
    """Restarts a datum.

    Parameters
    ----------
    pipeline_name : str
        The name of the pipeline.
    job_id : str
        The ID of the job.
    data_filters : List[str], optional
        A list of paths or hashes of datums that filter which datums are
        restarted.
    """
    self._req(
        Service.PPS,
        "RestartDatum",
        job=pps_proto.Job(
            pipeline=pps_proto.Pipeline(name=pipeline_name), id=job_id
        ),
        data_filters=data_filters,
    )
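# Usage sketch (illustrative): restarting only the datums whose input file
# matches a path filter. Assumes an authenticated `client`; the pipeline
# name, job ID, and filter path are placeholders.
#
#   client.restart_datum("edges", "467c580611234cdb8cc9758c7aa96087",
#                        data_filters=["/images/a.png"])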
def list_job(
    self,
    pipeline_name: str = None,
    input_commit: SubcommitType = None,
    history: int = 0,
    details: bool = False,
    jqFilter: str = None,
) -> Union[Iterator[pps_proto.JobInfo], Iterator[pps_proto.JobSetInfo]]:
    """Lists jobs.

    Parameters
    ----------
    pipeline_name : str, optional
        The name of a pipeline. If set, returns subjobs (jobs at the
        pipeline-level) only from this pipeline.
    input_commit : SubcommitType, optional
        A commit or list of commits from the input repo to filter jobs
        on. Only impacts returned results if `pipeline_name` is
        specified.
    history : int, optional
        Indicates to return jobs from historical versions of
        `pipeline_name`. Semantics are:

        - 0: Return jobs from the current version of `pipeline_name`
        - 1: Return the above and jobs from the next most recent version
        - 2: etc.
        - -1: Return jobs from all historical versions of `pipeline_name`
    details : bool, optional
        If true, return pipeline details for `pipeline_name`. Leaving
        this ``False`` can make the call significantly faster in clusters
        with a large number of pipelines and jobs. Note that if
        `input_commit` is valid, this field is coerced to ``True``.
    jqFilter : str, optional
        A ``jq`` filter that can filter the list of jobs returned; only
        applies if `pipeline_name` is provided.

    Returns
    -------
    Union[Iterator[pps_proto.JobInfo], Iterator[pps_proto.JobSetInfo]]
        An iterator of protobuf objects that contain info on either a
        subjob (job at the pipeline-level), if `pipeline_name` was
        specified, or a job, if `pipeline_name` wasn't specified.

    Examples
    --------
    >>> # List all jobs
    >>> jobs = list(client.list_job())
    ...
    >>> # List all jobs at a pipeline-level
    >>> subjobs = list(client.list_job("foo"))

    .. # noqa: W505
    """
    if pipeline_name is not None:
        if isinstance(input_commit, list):
            input_commit = [commit_from(ic) for ic in input_commit]
        elif input_commit is not None:
            input_commit = [commit_from(input_commit)]
        return self._req(
            Service.PPS,
            "ListJob",
            pipeline=pps_proto.Pipeline(name=pipeline_name),
            input_commit=input_commit,
            history=history,
            details=details,
            jqFilter=jqFilter,
        )
    else:
        return self._req(
            Service.PPS,
            "ListJobSet",
            details=details,
        )
def create_pipeline(
    self,
    pipeline_name: str,
    transform: pps_proto.Transform,
    parallelism_spec: pps_proto.ParallelismSpec = None,
    egress: pps_proto.Egress = None,
    reprocess_spec: str = None,
    update: bool = False,
    output_branch_name: str = None,
    s3_out: bool = False,
    resource_requests: pps_proto.ResourceSpec = None,
    resource_limits: pps_proto.ResourceSpec = None,
    sidecar_resource_limits: pps_proto.ResourceSpec = None,
    input: pps_proto.Input = None,
    description: str = None,
    reprocess: bool = False,
    service: pps_proto.Service = None,
    datum_set_spec: pps_proto.DatumSetSpec = None,
    datum_timeout: duration_pb2.Duration = None,
    job_timeout: duration_pb2.Duration = None,
    salt: str = None,
    datum_tries: int = 3,
    scheduling_spec: pps_proto.SchedulingSpec = None,
    pod_patch: str = None,
    spout: pps_proto.Spout = None,
    spec_commit: pfs_proto.Commit = None,
    metadata: pps_proto.Metadata = None,
    autoscaling: bool = False,
) -> None:
    """Creates a pipeline. For info on the params, please refer to the
    pipeline spec document:
    http://docs.pachyderm.io/en/latest/reference/pipeline_spec.html

    Parameters
    ----------
    pipeline_name : str
        The pipeline name.
    transform : pps_proto.Transform
        The image and commands run during pipeline execution.
    parallelism_spec : pps_proto.ParallelismSpec, optional
        Specifies how the pipeline is parallelized.
    egress : pps_proto.Egress, optional
        An external data store to publish the results of the pipeline to.
    reprocess_spec : str, optional
        Specifies how to handle already-processed datums.
    update : bool, optional
        If true, updates the existing pipeline with new args.
    output_branch_name : str, optional
        The branch name to output results on.
    s3_out : bool, optional
        If true, the output repo is exposed as an S3 gateway bucket.
    resource_requests : pps_proto.ResourceSpec, optional
        The amount of resources that the pipeline workers will consume.
    resource_limits : pps_proto.ResourceSpec, optional
        The upper threshold of allowed resources a given worker can
        consume. If a worker exceeds this value, it will be evicted.
    sidecar_resource_limits : pps_proto.ResourceSpec, optional
        The upper threshold of resources allocated to the sidecar
        containers.
    input : pps_proto.Input, optional
        The input repos to the pipeline. Commits to these repos will
        automatically trigger the pipeline to create new jobs to process
        them.
    description : str, optional
        A description of the pipeline.
    reprocess : bool, optional
        If true, forces the pipeline to reprocess all datums. Only has
        meaning if `update` is ``True``.
    service : pps_proto.Service, optional
        Creates a Service pipeline instead of a normal pipeline.
    datum_set_spec : pps_proto.DatumSetSpec, optional
        Specifies how a pipeline should split its datums into datum sets.
    datum_timeout : duration_pb2.Duration, optional
        The maximum execution time allowed for each datum.
    job_timeout : duration_pb2.Duration, optional
        The maximum execution time allowed for a job.
    salt : str, optional
        A tag for the pipeline.
    datum_tries : int, optional
        The number of times a job attempts to run on a datum when a
        failure occurs.
    scheduling_spec : pps_proto.SchedulingSpec, optional
        Specifies how the pods for a pipeline should be scheduled.
    pod_patch : str, optional
        Allows one to set fields in the pod spec that haven't been
        explicitly exposed in the rest of the pipeline spec.
    spout : pps_proto.Spout, optional
        Creates a Spout pipeline instead of a normal pipeline.
    spec_commit : pfs_proto.Commit, optional
        A spec commit to base the pipeline spec from.
    metadata : pps_proto.Metadata, optional
        Kubernetes labels and annotations to add as metadata to the
        pipeline pods.
    autoscaling : bool, optional
        If true, automatically scales the worker pool based on the datums
        it has to process.

    Notes
    -----
    If creating a Spout pipeline, when committing data to the repo, use
    commit methods (``client.commit()``, ``client.start_commit()``, etc.)
    or :class:`.ModifyFileClient` methods (``mfc.put_file_from_bytes()``,
    ``mfc.delete_file()``, etc.). For other pipelines, when committing
    data to the repo, write out to ``/pfs/out/``.

    Examples
    --------
    >>> client.create_pipeline(
    ...     "foo",
    ...     transform=pps_proto.Transform(
    ...         cmd=["python3", "main.py"],
    ...         image="example/image",
    ...     ),
    ...     input=pps_proto.Input(pfs=pps_proto.PFSInput(
    ...         repo="foo",
    ...         branch="master",
    ...         glob="/*",
    ...     )),
    ... )
    """
    self._req(
        Service.PPS,
        "CreatePipeline",
        pipeline=pps_proto.Pipeline(name=pipeline_name),
        transform=transform,
        parallelism_spec=parallelism_spec,
        egress=egress,
        update=update,
        output_branch=output_branch_name,
        s3_out=s3_out,
        resource_requests=resource_requests,
        resource_limits=resource_limits,
        sidecar_resource_limits=sidecar_resource_limits,
        input=input,
        description=description,
        reprocess=reprocess,
        metadata=metadata,
        service=service,
        datum_set_spec=datum_set_spec,
        datum_timeout=datum_timeout,
        job_timeout=job_timeout,
        salt=salt,
        datum_tries=datum_tries,
        scheduling_spec=scheduling_spec,
        pod_patch=pod_patch,
        spout=spout,
        spec_commit=spec_commit,
        reprocess_spec=reprocess_spec,
        autoscaling=autoscaling,
    )