def run_pipeline_in_experiment(api_pipeline: ApiPipeline, parameters: dict = None, run_name: str = None,
                               wait_for_status: bool = False):
    """Start a run of ``api_pipeline`` in the 'PIPELINE_RUNS' experiment.

    :param api_pipeline: pipeline to run; its ``id`` and ``name`` are used
    :param parameters: pipeline parameters, passed to KFP as ``params``
    :param run_name: job name for the run; defaults to the pipeline name
    :param wait_for_status: when true, wait for the run status via
        ``wait_for_run_status`` and fail if the workflow manifest reports a
        "failed"/"error" phase together with a message
    :return: the id of the created run
    :raises ApiError: on any failure; carries the error body (or the
        exception text) plus the KFP URL, and the exception's HTTP status
        when it has one, else 422
    """
    try:
        client = KfpClient(_pipeline_service_url)
        experiment = client.create_experiment('PIPELINE_RUNS')
        run_result = client.run_pipeline(experiment_id=experiment.id,
                                         job_name=run_name or api_pipeline.name,
                                         params=parameters,
                                         pipeline_id=api_pipeline.id)
        run_id = run_result.id

        if wait_for_status:
            run_details = wait_for_run_status(client, run_id, 10)
            run_status = json.loads(run_details.pipeline_runtime.workflow_manifest)["status"]

            if run_status \
                    and run_status.get("phase", "").lower() in ["failed", "error"] \
                    and run_status.get("message"):
                raise RuntimeError(f"Run {run_id} failed with error: {run_status['message']}")

        return run_id

    except Exception as e:
        # Format the exception inside the f-string: applying "%" to an
        # already-interpolated string breaks when the parameters contain "%".
        print(f"Exception trying to run pipeline {api_pipeline.id} '{api_pipeline.name}'"
              f" with parameters {parameters}:"
              f" {e}\n")

        # Only API exceptions carry body/status; anything else caught here
        # (e.g. the RuntimeError raised above) must not crash the handler.
        error_body = getattr(e, "body", None) or str(e)
        status = getattr(e, "status", None) or 422
        raise ApiError(message=f"{error_body}\nKFP URL: {_pipeline_service_url}",
                       http_status_code=status) from e
def get_pipeline(run_id, namespace=None):
    """Get Pipeline status

    Inside a Kubernetes cluster the run is read directly with a KFP client
    and converted to a dict; otherwise it is read through the run DB, which
    must be of kind "http".

    :param run_id:     id of pipelines run
    :param namespace:  k8s namespace if not default (``mlconf.namespace``)

    :return: kfp run dict
    """
    namespace = namespace or mlconf.namespace
    in_cluster = get_k8s_helper(silent=True).is_running_inside_kubernetes_cluster()

    if in_cluster:
        run = Client(namespace=namespace).get_run(run_id)
        return run.to_dict() if run else run

    run_db = get_run_db()
    if run_db.kind != "http":
        raise ValueError(
            "get pipeline require access to remote api-service"
            ", please set the dbpath url")
    return run_db.get_pipeline(run_id, namespace=namespace)
def upload_experiments(
    client: kfp.Client,
    pipeline_name: str,
    github_sha: str,
    experiment_name: str = "",
) -> str:
    """Function to upload an experiment to Kubeflow Pipelines.

    For clarity, the experiment will be registered to Kubeflow Pipelines
    named like below: {pipeline_name}-{experiment_name}
    If the experiment does not exist, it will be created newly with the
    specified name.
    If the experiment is not specified (empty), {pipeline_name}-default
    will be used. The name "Default" is used as-is, without a prefix.

    Args:
        client (kfp.Client) : KFP client.
        pipeline_name (str) : The name of the pipeline function.
        github_sha (str) : GitHub SHA generated in GitHub Actions.
            Not used by this function.
        experiment_name (str) : The experiment name. (Optional)

    Returns:
        str : The ID of the experiment.
    """
    if experiment_name == "Default":
        register_name = experiment_name
    else:
        # An empty name used to produce "<pipeline>-" with a dangling hyphen;
        # fall back to the documented "<pipeline>-default" instead.
        register_name = f"{pipeline_name}-{experiment_name or 'default'}"

    try:
        experiment_id = client.get_experiment(
            experiment_name=register_name).to_dict()["id"]
    except ValueError:
        # A ValueError from the lookup is treated as "no such experiment".
        experiment_id = client.create_experiment(
            name=register_name).to_dict()["id"]
        logging.info(f"The experiment is newly registered : {register_name}")
    return experiment_id
def __init__(self,
             host: Optional[str] = None,
             client_id: Optional[str] = None,
             namespace: Optional[str] = "kubeflow"):
    """
    Instantiate a new KubeflowClient.

    Builds the underlying KFP client, loads its configuration and connects
    the pipelines, runs and jobs APIs.

    Args:
        host (str): The host we can find the Kubeflow API at
            (e.g. https://{APP_NAME}.endpoints.{PROJECT_ID}.cloud.goog/pipeline)
        client_id (str): The IAP client id used for authorisation
            (e.g. "XXXXXX-XXXXXXXXX.apps.googleusercontent.com")
        namespace (str): The Kubernetes / Kubeflow namespace to deploy to
            (e.g. kubeflow)
    """
    self.host, self.client_id, self.namespace = host, client_id, namespace
    logging.info(f"KubeflowClient: host: {host}, client_id: {client_id}")

    self.kfp_client = Client(host, client_id, namespace)
    self.config = self.kfp_client._load_config(
        self.host, self.client_id, self.namespace, None, None)

    # One handle per API group.
    self.kfp_pipelines = self._connect_pipelines_api()
    self.kfp_runs = self._connect_runs_api()
    self.kfp_jobs = self._connect_jobs_api()
def run_pipeline(client: kfp.Client, pipeline_name: str, pipeline_id: str,
                 pipeline_paramters_path: dict):
    """Start a run of an already-uploaded pipeline.

    The experiment is looked up by the name in the INPUT_EXPERIMENT_NAME
    environment variable (required). INPUT_PIPELINE_NAMESPACE, when set to a
    non-blank value, is passed as the run namespace. The job name is the
    pipeline name followed by the current timestamp.

    Arguments:
        client {kfp.Client} -- The kfp client
        pipeline_name {str} -- Used as the prefix of the job name
        pipeline_id {str} -- The id of the pipeline to run
        pipeline_paramters_path -- Handed to ``read_pipeline_params`` to load
            the run parameters

    Raises:
        ValueError -- If no experiment id is found for the configured name
    """
    wanted_experiment = os.environ["INPUT_EXPERIMENT_NAME"]
    experiment_id = find_experiment_id(experiment_name=wanted_experiment,
                                       client=client)
    if not experiment_id:
        raise ValueError(
            "Failed to find experiment with the name: {}".format(wanted_experiment))
    logging.info(f"The expriment id is: {experiment_id}")

    # Unset, empty and whitespace-only values all mean "no namespace".
    namespace = None
    env_namespace = os.getenv("INPUT_PIPELINE_NAMESPACE")
    if env_namespace and not env_namespace.isspace():
        namespace = env_namespace
        logging.info(f"The namespace that will be used is: {namespace}")

    # [TODO] What would be a good way to name the jobs
    timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    job_name = pipeline_name + timestamp
    logging.info(f"The job name is: {job_name}")

    pipeline_params = read_pipeline_params(
        pipeline_paramters_path=pipeline_paramters_path)
    if pipeline_params is None:
        pipeline_params = {}

    logging.info(
        f"experiment_id: {experiment_id}, job_name:{job_name}, pipeline_params:{pipeline_params}, pipeline_id:{pipeline_id}, namespace:{namespace}"
    )
    # Params are read as yaml, people seem to prefer that to json.
    client.run_pipeline(experiment_id=experiment_id,
                        job_name=job_name,
                        params=pipeline_params,
                        pipeline_id=pipeline_id,
                        namespace=namespace)
    logging.info(
        "Successfully started the pipeline, head over to kubeflow to check it out"
    )
def __init__(self, config, project_name, context):
    """Set up the KFP client and the pipeline generator.

    An id token is obtained from ``AuthHandler`` and passed to the client as
    an existing token; host and run description are read from ``config``.
    """
    id_token = AuthHandler().obtain_id_token()

    self.host = config.host
    self.client = Client(self.host, existing_token=id_token)

    self.project_name = project_name
    self.pipeline_description = config.run_config.description
    self.generator = PipelineGenerator(config, project_name, context)
def deploy_to_dev(pipeline):
    """
    Deploy the Kubeflow Pipelines Pipeline (e.g. a method decorated with
    `@dsl.pipeline`) to Kubeflow and execute it.

    The experiment is named "<pipeline name>_tests" and the run is named
    after the pipeline plus the current timestamp. No run arguments are
    passed.

    Args:
        pipeline (func): The `@dsl.pipeline` method describing the pipeline

    Returns:
        True if the deployment succeeds
    """
    pipeline_name = pipeline.__name__
    experiment_name = f"{pipeline_name}_tests"
    stamp = datetime.now().strftime('%Y-%m-%d %H-%M-%S')
    run_name = f"{pipeline_name} {stamp}"

    for label, value in (("pipeline", pipeline_name),
                         ("experiment", experiment_name),
                         ("run", run_name)):
        print(f"hm> {label}: {value}")

    Client(None, None).create_run_from_pipeline_func(
        pipeline,
        {},
        run_name=run_name,
        experiment_name=experiment_name)

    print("hm> Deployed and running!")
    return True
def upload_pipeline_to_kfp(uploadfile: str, name: str = None, description: str = None) -> ApiPipeline:
    """Upload a pipeline package to KFP and return it as an ``ApiPipeline``.

    :param uploadfile: path of the pipeline package to upload
    :param name: pipeline name, passed to KFP as ``pipeline_name``
    :param description: pipeline description
    :return: the uploaded pipeline, with ``status`` set to the KFP pipeline's
        ``error`` field
    :raises ApiError: when the KFP API call fails; status 409 if the error
        message says the pipeline name already exists, else the API status
    """
    kfp_client = KfpClient()

    try:
        kfp_pipeline: KfpPipeline = kfp_client.upload_pipeline(
            pipeline_package_path=uploadfile,
            pipeline_name=name,
            description=description)
        api_pipeline: ApiPipeline = ApiPipeline.from_dict(kfp_pipeline.to_dict())
        api_pipeline.status = kfp_pipeline.error
        return api_pipeline

    except PipelineApiException as e:
        kfp_host = _pipeline_service_url
        print(f"Error calling PipelineServiceApi ({kfp_host}) -> upload_pipeline(name='{name}'): {e}")

        # The body may be missing, not JSON, or JSON without "error_message";
        # none of those should hide the original API error.
        try:
            error_body = json.loads(e.body)
        except (TypeError, ValueError):
            error_body = None

        if isinstance(error_body, dict) and error_body.get("error_message"):
            error_msg = str(error_body["error_message"])
        else:
            error_msg = str(e)

        status_code = 409 if "already exist. Please specify a new name" in error_msg else e.status
        raise ApiError(error_msg, status_code) from e
def upload_pipeline(
    client: kfp.Client,
    pipeline_zip_path: str,
    pipeline_name: str,
    github_sha: str,
) -> str:
    """Upload a zipped pipeline to Kubeflow Pipelines.

    A pipeline whose name is not registered yet is uploaded as a new
    pipeline. If the name already exists, the package is uploaded as a new
    version of that pipeline, named after the GitHub SHA.

    Args:
        client (kfp.Client) : KFP client.
        pipeline_zip_path (str) : A path to zipped pipeline file.
        pipeline_name (str) : The name of the pipeline function.
            This will be used as pipeline name.
        github_sha (str) : GitHub SHA generated in GitHub Actions.

    Returns:
        str : The ID of the pipeline.
    """
    existing_id = client.get_pipeline_id(pipeline_name)

    if existing_id is not None:
        # pipeline versioning with GitHub SHA
        client.upload_pipeline_version(
            pipeline_package_path=pipeline_zip_path,
            pipeline_version_name=github_sha,
            pipeline_id=existing_id,
        )
        return existing_id

    uploaded = client.upload_pipeline(
        pipeline_package_path=pipeline_zip_path,
        pipeline_name=pipeline_name,
    )
    new_id = uploaded.to_dict()["id"]
    logging.info(f"The pipeline is newly registered : {pipeline_name}")
    return new_id
def __init__(self, config, project_name, context):
    """Create the KFP client and remember project settings.

    The client authenticates with the token from ``obtain_id_token``. The
    reusable-component warning of ``dsl.ContainerOp`` is switched off
    globally as a side effect.
    """
    id_token = self.obtain_id_token()

    self.host = config.host
    self.client = Client(self.host, existing_token=id_token)

    self.project_name = project_name
    self.context = context

    # Class-level flag: affects every ContainerOp, not only this instance.
    dsl.ContainerOp._DISABLE_REUSABLE_COMPONENT_WARNING = True

    self.volume_meta = config.run_config.volume
def get_pipline(run_id, wait=0, namespace=None):
    """Get or wait for Pipeline status, wait time in sec

    With a falsy ``wait`` the run is fetched immediately; otherwise the call
    waits for run completion, passing ``wait`` as the timeout. The namespace
    defaults to ``mlconf.namespace``.
    """
    kfp_client = Client(namespace=namespace or mlconf.namespace)
    if not wait:
        return kfp_client.get_run(run_id)
    return kfp_client.wait_for_run_completion(run_id, wait)
def test_pipelines(name: str, params: list, fn: Callable):
    """Runs each pipeline that it's been parameterized for, and waits for it to succeed.

    ``params`` is a list of dicts with 'name' and 'value' keys; they become
    the run arguments. The run must reach status 'Succeeded' within the
    1200 timeout passed to the wait call.
    """
    client = Client('127.0.0.1:8888')

    arguments = {}
    for param in params:
        arguments[param['name']] = param['value']

    run = client.create_run_from_pipeline_func(fn, arguments=arguments)
    finished = client.wait_for_run_completion(run.run_id, timeout=1200)

    status = finished.to_dict()['run']['status']
    assert status == 'Succeeded', f'Pipeline status is {status}'
def get_or_create_experiment(experiment_name: str, client: kfp.Client) -> ApiExperiment:
    """Return the experiment named ``experiment_name``, creating it if absent.

    All pages of the experiment listing are searched (following
    ``next_page_token``), not just the first one, so an existing experiment
    is found regardless of how many experiments are registered.

    Args:
        experiment_name: exact name of the experiment to find or create
        client: the kfp client

    Returns:
        The existing or newly created experiment.
    """
    page_token = ""
    while True:
        response = client.list_experiments(page_token=page_token, page_size=100)

        # The listing's ``experiments`` may be None when there is nothing to list.
        for candidate in response.experiments or []:
            if candidate.name == experiment_name:
                print('Experiment already exists with id %s' % candidate.id)
                return candidate

        page_token = response.next_page_token
        if not page_token:
            break

    exp = client.create_experiment(experiment_name)
    print('Experiment %s created with ID %s' % (exp.name, exp.id))
    return exp
def _get_kfp_client(host=None):
    """Build a KFP client for the namespace of the current pod.

    Args:
        host: KFP API endpoint. When omitted, the in-cluster ml-pipeline
            service URL is used. (Previously this argument was ignored.)

    Returns:
        A ``Client`` for ``host`` and the pod namespace. If the
        KF_PIPELINES_SA_TOKEN_PATH environment variable is set, the token
        read from that file is passed as ``existing_token``.
    """
    user_namespace = podutils.get_namespace()
    host = host or 'http://ml-pipeline.kubeflow.svc.cluster.local:8888'
    log.info("hbseo _get_kfp_client() 'host:%s', namespace:%s", host, user_namespace)

    token_path = os.environ.get('KF_PIPELINES_SA_TOKEN_PATH')
    if not token_path:
        return Client(host=host, namespace=user_namespace)

    with open(token_path, 'r') as f:
        # Strip surrounding whitespace so a trailing newline in the file is
        # not sent as part of the token.
        token = f.read().strip()

    # Log only where the token came from; the token itself is a credential
    # and must not end up in the logs.
    log.info("hbseo _get_kfp_client() using token from '%s'", token_path)
    return Client(host=host, namespace=user_namespace, existing_token=token)
def list_piplines(
    full=False,
    page_token='',
    page_size=10,
    sort_by='',
    experiment_id=None,
    namespace=None,
):
    """List pipelines

    :param full:          return the run objects as received instead of a
                          summary dict per run
    :param page_token:    token of the page to fetch
    :param page_size:     number of runs per page
    :param sort_by:       sort expression passed to the API
    :param experiment_id: when given, only runs of this experiment are listed
                          (previously accepted but ignored)
    :param namespace:     k8s namespace, defaults to ``mlconf.namespace``

    :return: tuple of (total size, next page token, runs)
    """
    namespace = namespace or mlconf.namespace
    client = Client(namespace=namespace)

    query = dict(page_token=page_token, page_size=page_size, sort_by=sort_by)
    if experiment_id is not None:
        # Filter by experiment through the run API's resource reference.
        query['resource_reference_key_type'] = 'EXPERIMENT'
        query['resource_reference_key_id'] = experiment_id

    resp = client._run_api.list_runs(**query)
    runs = resp.runs

    if not full and runs:
        summary_fields = (
            'id',
            'name',
            'status',
            'error',
            'created_at',
            'scheduled_at',
            'finished_at',
            'description',
        )
        # Keep only the summary fields, stringified.
        runs = [
            {k: str(v) for k, v in run.to_dict().items() if k in summary_fields}
            for run in runs
        ]

    return resp.total_size, resp.next_page_token, runs
def find_experiment_id(experiment_name: str,
                       client: kfp.Client,
                       page_size: int = 100,
                       page_token: str = "") -> str:
    """Function to return the experiment id

    Pages through the experiment listing until an experiment with exactly
    this name is found or there is no next page.

    Arguments:
        experiment_name {str} -- The experiment name
        client {kfp.Client} -- The kfp client

    Keyword Arguments:
        page_size {int} -- Number of experiments to request per API call
        page_token {str} -- Page token to start listing from

    Returns:
        str -- The experiment id, or None when no experiment matches
    """
    while True:
        response = client.list_experiments(page_size=page_size,
                                           page_token=page_token)

        # Use a separate loop variable: reusing the response name made the
        # next-page lookup below read from an experiment item instead.
        for experiment in response.experiments or []:
            if experiment.name == experiment_name:
                logging.info("Succesfully collected the experiment id")
                return experiment.id

        # Need to know where to start the next iteration from.
        page_token = response.next_page_token

        # No next token: every page has been searched.
        if not page_token:
            logging.info(
                f"Could not find the experiment id, is the experiment name: {experiment_name} correct? "
            )
            return None
def find_pipeline_id(pipeline_name: str,
                     client: kfp.Client,
                     page_size: int = 100,
                     page_token: str = "") -> str:
    """Function to find the pipeline id of a pipeline.

    Pages through the pipeline listing until a pipeline with exactly this
    name is found or there is no next page.

    Arguments:
        pipeline_name {str} -- The name of the pipeline of interest
        client {kfp.Client} -- The kfp client

    Keyword Arguments:
        page_size {int} -- The number of pipelines to collect at each API request
        page_token {str} -- The page token to use for the API request (default: {""})

    Returns:
        str -- The pipeline id. If None no match
    """
    while True:
        response = client.list_pipelines(page_size=page_size,
                                         page_token=page_token)

        # ``pipelines`` may be None when the listing is empty.
        for pipeline in response.pipelines or []:
            if pipeline.name == pipeline_name:
                logging.info(f"The pipeline id is: {pipeline.id}")
                return pipeline.id

        # Need to know where to start the next iteration from.
        page_token = response.next_page_token

        # No next token: every page has been searched.
        if not page_token:
            logging.info(
                f"Could not find the pipeline, is the name: {pipeline_name} correct?"
            )
            return None
def init_pipeline_client():
    """Create a new kfp client.

    The endpoint is read from the KF_PIPELINES_ENDPOINT environment
    variable, with '0.0.0.0:31380/pipeline' as the fallback.

    Returns:
        An instance of kfp client.
    """
    endpoint = getenv("KF_PIPELINES_ENDPOINT", '0.0.0.0:31380/pipeline')
    return Client(endpoint)
def kfp_client():
    """
    Build a kfp.Client object for the configured endpoint.

    A new client is created on every call. This is a function rather than a
    module-level constant because, per the original note, the client makes a
    request during __init__ (before the mock API is available), which made
    tests fail.

    The endpoint comes from KF_PIPELINES_ENDPOINT (default
    "ml-pipeline.kubeflow:8888") and the user namespace is set to
    KF_PIPELINES_NAMESPACE.

    Returns
    -------
    kfp.Client
    """
    client = Client(host=getenv("KF_PIPELINES_ENDPOINT", "ml-pipeline.kubeflow:8888"))

    # user namespace is stored in a configuration file at
    # $HOME/.config/kfp/context.json, so make sure the directory exists
    context_dir = path.join(str(Path.home()), ".config", "kfp")
    makedirs(context_dir, exist_ok=True)

    client.set_user_namespace(namespace=KF_PIPELINES_NAMESPACE)
    return client
def _mock_get_run(
    kfp_client_mock: kfp.Client,
    api_run_detail: kfp_server_api.models.api_run_detail.ApiRunDetail,
):
    """Make ``kfp_client_mock.get_run`` return ``api_run_detail``.

    The replacement accepts and ignores any positional or keyword arguments.
    """
    kfp_client_mock.get_run = lambda *args, **kwargs: api_run_detail
def run_v2_pipeline(
    client: kfp.Client,
    fn: Callable,
    driver_image: Optional[str],
    launcher_v2_image: Optional[str],
    pipeline_root: Optional[str],
    enable_caching: bool,
    arguments: dict[str, str],
):
    """Compile ``fn`` as a v2 pipeline, convert it to an Argo workflow and run it.

    Steps: compile the pipeline function to a pipeline spec, wrap it in a
    pipeline job (caching option on every DAG task, ``arguments`` as
    parameter values), call the ``kfp-v2-compiler`` CLI to turn the job into
    an Argo workflow, then submit that workflow with the client.

    All intermediate files live in a private temporary directory that is
    removed when the function returns or raises.

    Args:
        client: the kfp client used to create the run
        fn: the pipeline function to compile
        driver_image: passed to the CLI as ``--driver`` when set
        launcher_v2_image: passed to the CLI as ``--launcher`` when set
        pipeline_root: passed to the CLI as ``--pipeline_root`` when set
        enable_caching: caching flag applied to every task and to the run
        arguments: pipeline parameter values

    Returns:
        The result of ``client.create_run_from_pipeline_package``.

    Raises:
        subprocess.CalledProcessError: if the compiler CLI exits non-zero.
    """
    import os
    import subprocess
    import tempfile

    # A private directory instead of tempfile.mktemp(): mktemp only returns a
    # name, which is race-prone, and the files were never cleaned up.
    with tempfile.TemporaryDirectory(prefix="kfp_v2_run_") as workdir:
        original_pipeline_spec = os.path.join(workdir, "original_pipeline_spec.json")
        kfp.v2.compiler.Compiler().compile(pipeline_func=fn,
                                           package_path=original_pipeline_spec)

        # remove following overriding logic once we use create_run_from_job_spec
        # to trigger kfp pipeline run
        with open(original_pipeline_spec) as f:
            pipeline_job_dict = {
                'pipelineSpec': json.load(f),
                'runtimeConfig': {},
            }

        pipeline_spec = pipeline_job_dict['pipelineSpec']
        components = [pipeline_spec['root']] + list(pipeline_spec['components'].values())
        for component in components:
            if 'dag' in component:
                for task in component['dag']['tasks'].values():
                    task['cachingOptions'] = {'enableCache': enable_caching}

        if arguments:
            pipeline_job_dict['runtimeConfig']['parameterValues'] = dict(arguments)

        pipeline_job = os.path.join(workdir, "pipeline_job.json")
        with open(pipeline_job, 'w') as f:
            json.dump(pipeline_job_dict, f)

        args = [
            'kfp-v2-compiler',
            '--job',
            pipeline_job,
        ]
        if driver_image:
            args += ['--driver', driver_image]
        if launcher_v2_image:
            args += ['--launcher', launcher_v2_image]
        if pipeline_root:
            args += ['--pipeline_root', pipeline_root]

        argo_workflow_spec = os.path.join(workdir, "argo_workflow_spec.yaml")
        with open(argo_workflow_spec, 'w') as f:
            # call v2 backend compiler CLI to compile pipeline spec to argo workflow
            subprocess.check_call(args, stdout=f)

        # Submit while the workflow file still exists (inside the with block).
        return client.create_run_from_pipeline_package(
            pipeline_file=argo_workflow_spec,
            arguments={},
            enable_caching=enable_caching)
def init_pipeline_client():
    """Create a new kfp client.

    Host and namespace come from the KF_PIPELINES_ENDPOINT and
    KF_PIPELINES_NAMESPACE environment variables, falling back to
    '0.0.0.0:31380/pipeline' and 'deployments'.

    Returns:
        An instance of kfp client.
    """
    endpoint = getenv('KF_PIPELINES_ENDPOINT', '0.0.0.0:31380/pipeline')
    namespace = getenv('KF_PIPELINES_NAMESPACE', 'deployments')
    return Client(host=endpoint, namespace=namespace)
def get_pipeline(
    run_id,
    namespace=None,
    format_: Union[str, mlrun.api.schemas.PipelinesFormat] = mlrun.api.schemas.PipelinesFormat.summary,
    project: str = None,
    remote: bool = True,
):
    """Get Pipeline status

    With ``remote`` the run is fetched through the run DB (which must be of
    kind "http"); otherwise it is read with a KFP client, converted to a dict
    and, for the summary format, reduced with
    ``format_summary_from_kfp_run``. The result is also shown via
    ``show_kfp_run``.

    :param run_id:     id of pipelines run
    :param namespace:  k8s namespace if not default
    :param format_:    Format of the results. Possible values are:
            - ``summary`` (default value) - Return summary of the object data.
            - ``full`` - Return full pipeline object.
    :param project:    the project of the pipeline run
    :param remote:     read kfp data from mlrun service (default=True)

    :return: kfp run dict
    """
    namespace = namespace or mlconf.namespace

    if not remote:
        resp = Client(namespace=namespace).get_run(run_id)
        if resp:
            resp = resp.to_dict()
            wants_summary = (
                not format_
                or format_ == mlrun.api.schemas.PipelinesFormat.summary.value
            )
            if wants_summary:
                resp = format_summary_from_kfp_run(resp)
    else:
        run_db = get_run_db()
        if run_db.kind != "http":
            raise ValueError(
                "get pipeline require access to remote api-service"
                ", please set the dbpath url")
        resp = run_db.get_pipeline(run_id,
                                   namespace=namespace,
                                   format_=format_,
                                   project=project)

    show_kfp_run(resp)
    return resp
def _monitor_kfp_submission(runtime_config: dict, runtime_config_name: str, run_id: str, timeout: int) -> str:
    """Monitor the status of a Kubeflow Pipelines run

    Authenticates against the endpoint found in the runtime configuration
    metadata, then waits for the run to complete.

    :param runtime_config: runtime configuration; its ``metadata`` supplies
        the endpoint, auth settings and user namespace
    :param runtime_config_name: name of the runtime configuration, passed to
        the authenticator
    :param run_id: id of the run to monitor
    :param timeout: wait limit; multiplied by 60 because the API expects
        seconds
    :return: the run status, or "unknown" if monitoring failed
    :raises click.ClickException: if authentication fails
    :raises TimeoutError: if the run did not finish within the timeout
    """
    metadata = runtime_config.metadata
    api_endpoint = metadata["api_endpoint"].rstrip("/")

    try:
        # Authenticate with the KFP server
        auth_info = KFPAuthenticator().authenticate(
            api_endpoint,
            auth_type_str=metadata.get("auth_type"),
            runtime_config_name=runtime_config_name,
            auth_parm_1=metadata.get("api_username"),
            auth_parm_2=metadata.get("api_password"),
        )
    except AuthenticationError as ae:
        if ae.get_request_history() is not None:
            click.echo("An authentication error was raised. Diagnostic information follows.")
            click.echo(ae.request_history_to_string())
        raise click.ClickException(f"Kubeflow authentication failed: {ae}")

    try:
        # Create a Kubeflow Pipelines client. There is no need to use a
        # Tekton client, because the monitoring API is agnostic.
        client = ArgoClient(
            host=api_endpoint,
            cookies=auth_info.get("cookies", None),
            credentials=auth_info.get("credentials", None),
            existing_token=auth_info.get("existing_token", None),
            namespace=metadata.get("user_namespace"),
        )
        # wait for the run to complete or timeout (API uses seconds as unit - convert)
        run_details = client.wait_for_run_completion(run_id, timeout * 60)
    except TimeoutError:
        # pipeline processing did not finish yet, stop monitoring
        raise
    except Exception as ex:
        # log error and return 'unknown' status
        click.echo(f"Monitoring failed: {type(ex)}: {ex}")
        return "unknown"

    # Outside the try on purpose: errors reading the status are not swallowed.
    return run_details.run.status
def _mock_pipelines_creation(kfp_client_mock: kfp.Client):
    """Stub out experiment and run creation on ``kfp_client_mock``.

    ``create_experiment`` returns an ``ApiExperiment`` with id "some-exp-id"
    and the given name and description; ``run_pipeline`` returns an
    ``ApiRun`` with id "some-run-id" named after the job.
    """

    def _fake_run_pipeline(
        experiment_id,
        job_name,
        pipeline_package_path=None,
        params={},  # never mutated here
        pipeline_id=None,
        version_id=None,
    ):
        return kfp_server_api.models.ApiRun(id="some-run-id", name=job_name)

    def _fake_create_experiment(name, description=None, namespace=None):
        return kfp_server_api.models.ApiExperiment(
            id="some-exp-id",
            name=name,
            description=description,
        )

    kfp_client_mock.create_experiment = _fake_create_experiment
    kfp_client_mock.run_pipeline = _fake_run_pipeline
def delete_kfp_pipeline(pipeline_id: str):
    """Delete the pipeline with the given id from KFP.

    :param pipeline_id: id of the pipeline to delete
    :raises ApiError: when the KFP API call fails; carries the error body,
        the KFP URL and the API status (422 if the status is empty)
    :raises AttributeError: any AttributeError other than the known
        'ERRORUNKNOWN' one, which is ignored
    """
    api_instance = KfpClient()

    try:
        api_instance.delete_pipeline(pipeline_id)

    except AttributeError as e:
        # ignore KFP AttributeError. It is a bug in the Swagger-generated
        # client code for Kubeflow Pipelines
        if str(e) != "module 'kfp_pipeline.models' has no attribute 'ERRORUNKNOWN'":
            raise

    except PipelineApiException as e:
        kfp_host = api_instance.api_client.configuration.host
        # Interpolate the exception in the f-string itself: "%"-formatting an
        # already-built string fails if the host contains a "%".
        print(f"Exception when calling PipelineServiceApi ({kfp_host}) -> delete_pipeline: {e}\n")
        raise ApiError(message=f"{e.body}\nKFP URL: {kfp_host}",
                       http_status_code=e.status or 422) from e
def test_pipelines():
    """Run the ``download_and_join`` pipeline and check that it succeeds.

    The ml-pipeline service's cluster IP is looked up with
    ``microk8s kubectl``; the client talks to that IP on port 8888.
    """
    kubectl_cmd = [
        "microk8s",
        "kubectl",
        "get",
        "services/ml-pipeline",
        "-nkubeflow",
        "-ojson",
    ]
    service = json.loads(check_output(kubectl_cmd).decode("utf-8"))
    cluster_ip = service["spec"]["clusterIP"]

    client = Client(f"http://{cluster_ip}:8888")
    run = client.create_run_from_pipeline_func(
        download_and_join,
        arguments={
            "url1": "gs://ml-pipeline/sample-data/shakespeare/shakespeare1.txt",
            "url2": "gs://ml-pipeline/sample-data/shakespeare/shakespeare2.txt",
        },
    )

    finished = client.wait_for_run_completion(run.run_id, timeout=3600)
    status = finished.to_dict()["run"]["status"]
    assert status == "Succeeded", f"Pipeline cowsay status is {status}"
def main():
    """Run a sample pipeline end to end and record the results as junit xml.

    Steps: check that the input yaml exists, create an experiment, start a
    run from the input file, wait for completion, print the Argo log and
    write the junit report. Exits with code 1 when the input file is missing
    or the run does not succeed.
    """
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    input_exists = os.path.exists(args.input)
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         input_exists, 'yaml file is not generated')
    if not input_exists:
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment = client.create_experiment(args.testname + ' sample experiment')
    experiment_id = experiment.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    run = client.run_pipeline(experiment_id, args.testname + '_sample', args.input, {})
    run_id = run.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    started_at = datetime.now()
    completion = client.wait_for_run_completion(run_id, 1200)
    succ = completion.run.status.lower() == 'succeeded'
    elapsed_time = (datetime.now() - started_at).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_id = client._get_workflow_json(run_id)['metadata']['name']
    argo_log, _ = utils.run_bash_command(
        'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
def check(self):
    """ Check the pipeline running results of the notebook sample.

    Records whether the test script exited with code '0', loads the test
    timeout from DEFAULT_CONFIG and, if pipeline runs are expected, waits for
    every listed run of the experiment and prints its Argo log. The junit
    report is written at the end; on the first unsuccessful run it is written
    immediately and the process exits with code 1.

    Raises:
        RuntimeError: if the default config is not valid yaml.
        FileExistsError: if the default config cannot be opened.
    """
    test_cases = []
    test_name = self._testname + ' Sample Test'

    ###### Write the script exit code log ######
    script_ok = (self._exit_code == '0')
    utils.add_junit_test(
        test_cases, 'test script execution', script_ok,
        'test script failure with exit code: ' + self._exit_code)

    try:
        with open(DEFAULT_CONFIG, 'r') as f:
            raw_args = yaml.safe_load(f)
    except yaml.YAMLError as yamlerr:
        raise RuntimeError('Illegal default config:{}'.format(yamlerr))
    except OSError as ose:
        raise FileExistsError('Default config not found:{}'.format(ose))
    # Both handlers raise, so this only runs after a successful load.
    test_timeout = raw_args['test_timeout']

    if self._run_pipeline:
        ###### Initialization ######
        client = Client(host=self._host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=self._experiment_name).id

        ###### Get runs ######
        listing = client.list_runs(page_size=RUN_LIST_PAGE_SIZE,
                                   experiment_id=experiment_id)

        ###### Check all runs ######
        for run in listing.runs:
            run_id = run.id
            completion = client.wait_for_run_completion(run_id, test_timeout)
            succ = completion.run.status.lower() == 'succeeded'
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_id = client._get_workflow_json(run_id)['metadata']['name']
            print("Argo Workflow Name: ", workflow_id)
            argo_log, _ = utils.run_bash_command(
                'argo logs {} -n {}'.format(workflow_id, self._namespace))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, self._result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, self._result, test_cases)
def pipeline_id(client: kfp.Client, name: str):
    """Gets the ID of the kubeflow pipeline with the name 'name'

    Pages through the pipeline listing, 100 entries at a time, until a
    pipeline with exactly this name is found or there is no next page.

    Args:
      client: the kfp client
      name: name of the pipeline

    Returns:
      id of the pipeline, or "" if no pipeline has that name
    """
    page_token = ""
    while True:
        response = client.list_pipelines(page_token=page_token, page_size=100)
        if response.pipelines is None:
            return ""

        # Separate loop variable: reusing the response name made the
        # next-page lookup read from a pipeline item instead of the response.
        for pipeline in response.pipelines:
            if pipeline.name == name:
                return pipeline.id

        page_token = response.next_page_token
        # Stop on None *and* on an empty token; an empty token would
        # otherwise request the first page again forever.
        if not page_token:
            return ""