def upload_experiments(
    client: kfp.Client,
    pipeline_name: str,
    github_sha: str,
    experiment_name: str = "",
) -> str:
    """Function to upload an experiment to Kubeflow Pipelines.

    For clarity, the experiment will be registered to Kubeflow Pipelines
    named like below:

        {pipeline_name}-{experiment_name}

    If the experiment does not exist, it will be newly created with the
    specified name. If the experiment is not specified,
    {pipeline_name}-default will be used.

    Args:
        client (kfp.Client) : KFP client.
        pipeline_name (str) : The name of the pipeline function.
        github_sha (str) : GitHub SHA generated in GitHub Actions.
        experiment_name (str) : The experiment name. (Optional)

    Returns:
        str : The ID of the experiment.
    """
    register_name = (f"{pipeline_name}-{experiment_name}"
                     if experiment_name != "Default" else experiment_name)
    try:
        experiment_id = client.get_experiment(
            experiment_name=register_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=register_name).to_dict()["id"]
        logging.info(f"The experiment is newly registered : {register_name}")
    return experiment_id
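# A minimal, hypothetical usage sketch for upload_experiments(); the host URL,
# pipeline name, SHA, and experiment name below are placeholders, not values
# from the original code.
if __name__ == "__main__":
    import os
    import kfp

    kfp_client = kfp.Client(
        host=os.environ.get("KUBEFLOW_URL", "http://localhost:8080"))
    experiment_id = upload_experiments(
        client=kfp_client,
        pipeline_name="example-pipeline",
        github_sha="0123abcd",
        experiment_name="dev",
    )
    print(f"Using experiment: {experiment_id}")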
def check(self):
    """ Check the pipeline running results of the notebook sample. """
    test_cases = []
    test_name = self._testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(
        test_cases, 'test script execution', (self._exit_code == '0'),
        'test script failure with exit code: ' + self._exit_code)

    try:
        with open(DEFAULT_CONFIG, 'r') as f:
            raw_args = yaml.safe_load(f)
    except yaml.YAMLError as yamlerr:
        raise RuntimeError('Illegal default config:{}'.format(yamlerr))
    except OSError as ose:
        raise FileExistsError('Default config not found:{}'.format(ose))
    else:
        test_timeout = raw_args['test_timeout']

    if self._run_pipeline:
        experiment = self._experiment_name

        ###### Initialization ######
        client = Client(host=self._host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=RUN_LIST_PAGE_SIZE,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, test_timeout)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            print("Argo Workflow Name: ", workflow_id)
            argo_log, _ = utils.run_bash_command(
                'argo logs {} -n {}'.format(workflow_id, self._namespace))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, self._result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, self._result, test_cases)
def run_pipeline_func(client: kfp.Client,
                      pipeline_name: str,
                      pipeline_id: str,
                      pipeline_paramters_path: dict,
                      recurring_flag: bool = False,
                      cron_exp: str = ''):
    pipeline_params = read_pipeline_params(
        pipeline_paramters_path=pipeline_paramters_path)
    pipeline_params = pipeline_params if pipeline_params is not None else {}

    experiment_id = None
    experiment_name = "{}-{}".format(pipeline_name,
                                     os.environ["INPUT_EXPERIMENT_NAME"])
    try:
        experiment_id = client.get_experiment(
            experiment_name=experiment_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=experiment_name).to_dict()["id"]

    namespace = os.getenv("INPUT_PIPELINE_NAMESPACE") if not str.isspace(
        os.getenv("INPUT_PIPELINE_NAMESPACE")) else None
    job_name = 'Run {} on {}'.format(
        pipeline_name,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    logging.info(f"experiment_id: {experiment_id}, \
        job_name: {job_name}, \
        pipeline_params: {pipeline_params}, \
        pipeline_id: {pipeline_id}, \
        namespace: {namespace}, \
        cron_exp: {cron_exp}")

    if recurring_flag == "true":
        client.create_recurring_run(experiment_id=experiment_id,
                                    job_name=job_name,
                                    params=pipeline_params,
                                    pipeline_id=pipeline_id,
                                    cron_expression=cron_exp)
        logging.info(
            "Successfully started the recurring pipeline, head over to kubeflow to check it out"
        )

    client.run_pipeline(experiment_id=experiment_id,
                        job_name=job_name,
                        params=pipeline_params,
                        pipeline_id=pipeline_id)
    logging.info(
        "Successfully started the pipeline, head over to kubeflow to check it out"
    )
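# A rough, hypothetical invocation sketch for run_pipeline_func(). The INPUT_*
# environment variables mirror the names the function reads; the host, pipeline ID,
# params path, and cron expression are placeholders rather than values from the
# original code.
if __name__ == "__main__":
    import os
    import kfp

    os.environ.setdefault("INPUT_EXPERIMENT_NAME", "dev")
    os.environ.setdefault("INPUT_PIPELINE_NAMESPACE", "kubeflow")

    kfp_client = kfp.Client(
        host=os.environ.get("KUBEFLOW_URL", "http://localhost:8080"))
    run_pipeline_func(
        client=kfp_client,
        pipeline_name="example-pipeline",
        pipeline_id="<pipeline-uuid>",
        pipeline_paramters_path="params.yaml",  # path consumed by read_pipeline_params()
        recurring_flag="true",  # note: the function compares against the string "true"
        cron_exp="0 0 4 * * *",
    )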
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(
        test_cases, 'test script execution', (args.exit_code == '0'),
        'test script failure with exit code: ' + args.exit_code)

    if args.experiment is not None:
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=args.experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=1000,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, 1200)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, args.result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
def check(self):
    test_cases = []
    test_name = self._testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(test_cases, 'test script execution',
                         (self._exit_code == '0'),
                         'test script failure with exit code: ' +
                         self._exit_code)

    if self._experiment is not None:  # Bypassing dsl type check sample.
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=self._experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=_RUN_LIST_PAGE_SIZE,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, _TEST_TIMEOUT)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(self._namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, self._result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, self._result, test_cases)
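# The three checkers above share the same kfp.Client polling pattern. Below is a
# minimal standalone sketch of that pattern without the project-internal `utils`
# helpers; the host, experiment name, page size, and timeout are placeholder
# assumptions.
import kfp


def all_runs_succeeded(host: str, experiment_name: str, timeout: int = 1200) -> bool:
    client = kfp.Client(host=host)
    experiment_id = client.get_experiment(experiment_name=experiment_name).id
    runs = client.list_runs(page_size=100, experiment_id=experiment_id).runs or []
    for run in runs:
        response = client.wait_for_run_completion(run.id, timeout)
        if response.run.status.lower() != 'succeeded':
            return False
    return True


# Example (assumes in-cluster DNS like the scripts above):
# ok = all_runs_succeeded('ml-pipeline.kubeflow.svc.cluster.local:8888', 'my-experiment')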
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = AuthHandler().obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.pipeline_description = config.run_config.description
        self.generator = PipelineGenerator(config, project_name, context)

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generator.generate_pipeline(pipeline, image,
                                             image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )

        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def compile(self,
                pipeline,
                image,
                output,
                image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generator.generate_pipeline(pipeline, image,
                                             image_pull_policy),
            output,
        )
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generator.generate_pipeline(pipeline, image,
                                                    image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id, version_id) = self._upload_pipeline(pipeline)
            self.log.info("Pipeline created")

        self.log.info(
            f"Pipeline link: {self.host}/#/pipelines/details/%s/version/%s",
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines
        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id):
        version_name = f"{clean_name(self.project_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name,
                name=version_name,
                pipelineid=pipeline_id,
                _request_timeout=10000,
            ).id

    def _upload_pipeline(self, pipeline_func):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name,
                name=self.project_name,
                description=self.pipeline_description,
                _request_timeout=10000,
            )
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise

            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")

        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")
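# A small standalone sketch of the name-equality filter that _get_pipeline_id()
# above builds, expressed against a plain kfp.Client; the host and pipeline name
# are placeholders, not values from the original code.
import json
import kfp

client = kfp.Client(host="http://localhost:8080")
name_filter = json.dumps({
    "predicates": [{
        "key": "name",
        "op": 1,  # op 1 is the equality predicate in the KFP filter API
        "string_value": "example-project",
    }]
})
matches = client.pipelines.list_pipelines(filter=name_filter).pipelines
pipeline_id = matches[0].id if matches else None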
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = self.obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.context = context
        dsl.ContainerOp._DISABLE_REUSABLE_COMPONENT_WARNING = True
        self.volume_meta = config.run_config.volume

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generate_pipeline(pipeline, image, image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )

        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def obtain_id_token(self):
        from google.auth.transport.requests import Request
        from google.oauth2 import id_token
        from google.auth.exceptions import DefaultCredentialsError

        client_id = os.environ.get(IAP_CLIENT_ID, None)

        jwt_token = None

        if not client_id:
            self.log.info(
                "No IAP_CLIENT_ID provided, skipping custom IAP authentication"
            )
            return jwt_token

        try:
            self.log.debug("Obtaining JWT token for %s." % client_id)
            jwt_token = id_token.fetch_id_token(Request(), client_id)
            self.log.info("Obtained JWT token for MLFLOW connectivity.")
        except DefaultCredentialsError as ex:
            self.log.warning(
                str(ex) +
                (" Note that this authentication method does not work with default"
                 " credentials obtained via 'gcloud auth application-default login'"
                 " command. Refer to documentation on how to configure service account"
                 " locally"
                 " (https://cloud.google.com/docs/authentication/production#manually)"
                 ))
        except Exception as e:
            self.log.error("Failed to obtain IAP access token. " + str(e))
        finally:
            return jwt_token

    def generate_pipeline(self, pipeline, image, image_pull_policy):
        @dsl.pipeline(
            name=self.project_name,
            description="Kubeflow pipeline for Kedro project",
        )
        def convert_kedro_pipeline_to_kfp() -> None:
            """Convert from a Kedro pipeline into a kfp container graph."""
            node_volumes = (_setup_volumes()
                            if self.volume_meta is not None else {})
            node_dependencies = self.context.pipelines.get(
                pipeline).node_dependencies
            kfp_ops = _build_kfp_ops(node_dependencies, node_volumes)
            for node, dependencies in node_dependencies.items():
                for dependency in dependencies:
                    kfp_ops[node.name].after(kfp_ops[dependency.name])

        def _setup_volumes():
            vop = dsl.VolumeOp(
                name="data-volume-create",
                resource_name="data-volume",
                size=self.volume_meta.size,
                modes=self.volume_meta.access_modes,
                storage_class=self.volume_meta.storageclass,
            )
            if self.volume_meta.skip_init:
                return {"/home/kedro/data": vop.volume}
            else:
                volume_init = dsl.ContainerOp(
                    name="data-volume-init",
                    image=image,
                    command=["sh", "-c"],
                    arguments=[
                        " ".join([
                            "cp",
                            "--verbose",
                            "-r",
                            "/home/kedro/data/*",
                            "/home/kedro/datavolume",
                        ])
                    ],
                    pvolumes={"/home/kedro/datavolume": vop.volume},
                )
                volume_init.container.set_image_pull_policy(image_pull_policy)
                return {"/home/kedro/data": volume_init.pvolume}

        def _build_kfp_ops(node_dependencies: Dict[Node, Set[Node]],
                           node_volumes: Dict) -> Dict[str, dsl.ContainerOp]:
            """Build kfp container graph from Kedro node dependencies."""
            kfp_ops = {}

            env = [
                V1EnvVar(name=IAP_CLIENT_ID,
                         value=os.environ.get(IAP_CLIENT_ID, ""))
            ]

            if is_mlflow_enabled():
                kfp_ops["mlflow-start-run"] = dsl.ContainerOp(
                    name="mlflow-start-run",
                    image=image,
                    command=["kedro"],
                    arguments=[
                        "kubeflow",
                        "mlflow-start",
                        dsl.RUN_ID_PLACEHOLDER,
                    ],
                    file_outputs={"mlflow_run_id": "/tmp/mlflow_run_id"},
                )
                kfp_ops["mlflow-start-run"].container.set_image_pull_policy(
                    image_pull_policy)

                env.append(
                    V1EnvVar(
                        name="MLFLOW_RUN_ID",
                        value=kfp_ops["mlflow-start-run"].output,
                    ))

            for node in node_dependencies:
                name = _clean_name(node.name)
                kfp_ops[node.name] = dsl.ContainerOp(
                    name=name,
                    image=image,
                    command=["kedro"],
                    arguments=["run", "--node", node.name],
                    pvolumes=node_volumes,
                    container_kwargs={"env": env},
                )
                kfp_ops[node.name].container.set_image_pull_policy(
                    image_pull_policy)

            return kfp_ops

        return convert_kedro_pipeline_to_kfp

    def compile(self,
                pipeline,
                image,
                output,
                image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generate_pipeline(pipeline, image, image_pull_policy), output)
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generate_pipeline(pipeline, image, image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id,
                                                       self.project_name)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id,
             version_id) = self._upload_pipeline(pipeline, self.project_name)
            self.log.info("Pipeline created")

        self.log.info(
            f"Pipeline link: {self.host}/#/pipelines/details/%s/version/%s",
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines
        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id,
                                 pipeline_name):
        version_name = f"{_clean_name(pipeline_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name, name=version_name, pipelineid=pipeline_id).id

    def _upload_pipeline(self, pipeline_func, pipeline_name):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name, name=pipeline_name)
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise

            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")

        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")
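# A small standalone sketch of the IAP token flow used by obtain_id_token() above,
# based on the google-auth library; the environment variable name and endpoint in
# the trailing comments are placeholders.
import os

from google.auth.transport.requests import Request
from google.oauth2 import id_token


def fetch_iap_token(client_id_env: str = "IAP_CLIENT_ID"):
    # Mirrors the method above: returns None when no OAuth client ID is configured.
    audience = os.environ.get(client_id_env)
    if not audience:
        return None
    # fetch_id_token() uses Application Default Credentials (e.g. a service account
    # key referenced by GOOGLE_APPLICATION_CREDENTIALS) to mint an ID token for the
    # given audience.
    return id_token.fetch_id_token(Request(), audience)


# token = fetch_iap_token()
# client = Client("https://<kfp-endpoint>", existing_token=token)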