Code Example #1
def upload_experiments(
    client: kfp.Client,
    pipeline_name: str,
    github_sha: str,
    experiment_name: str = "default",
) -> str:
    """Function to upload an experiment to Kubeflow Pipelines.

    For clarity, the experiment is registered to Kubeflow Pipelines under a name of the form:
        {pipeline_name}-{experiment_name}
    If the experiment does not exist, it is created with that name.
    If no experiment name is specified, {pipeline_name}-default is used.

    Args:
        client (kfp.Client) : KFP client.
        pipeline_name (str) : The name of the pipeline function.
        github_sha (str) : GitHub SHA generated in GitHub Actions.
        experiment_name (str) : The experiment name. (Optional)

    Returns:
        str : The ID of the experiment.
    """
    register_name = (f"{pipeline_name}-{experiment_name}"
                     if experiment_name != "Default" else experiment_name)
    try:
        experiment_id = client.get_experiment(
            experiment_name=register_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=register_name).to_dict()["id"]
        logging.info(f"The experiment is newly registered : {register_name}")
    return experiment_id
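
A minimal usage sketch for the function above. The endpoint URL, pipeline name, SHA, and experiment name are placeholders (not part of the original snippet), and it assumes upload_experiments is importable from the surrounding module.

import logging

import kfp

logging.basicConfig(level=logging.INFO)

# Hypothetical KFP endpoint and values, for illustration only.
client = kfp.Client(host="http://localhost:8080")
experiment_id = upload_experiments(
    client=client,
    pipeline_name="my-pipeline",
    github_sha="0123abcd",      # in CI this would be the GitHub Actions SHA
    experiment_name="dev",      # registered as "my-pipeline-dev"
)
print(f"Experiment ID: {experiment_id}")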
Code Example #2
    def check(self):
        """ Check the pipeline running results of the notebook sample. """
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Write the script exit code log ######
        utils.add_junit_test(
            test_cases, 'test script execution', (self._exit_code == '0'),
            'test script failure with exit code: ' + self._exit_code)

        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config: {}'.format(yamlerr))
        except OSError as ose:
            raise FileNotFoundError('Default config not found: {}'.format(ose))
        else:
            test_timeout = raw_args['test_timeout']

        if self._run_pipeline:
            experiment = self._experiment_name
            ###### Initialization ######
            client = Client(host=self._host)

            ###### Get experiments ######
            experiment_id = client.get_experiment(
                experiment_name=experiment).id

            ###### Get runs ######
            list_runs_response = client.list_runs(page_size=RUN_LIST_PAGE_SIZE,
                                                  experiment_id=experiment_id)

            ###### Check all runs ######
            for run in list_runs_response.runs:
                run_id = run.id
                response = client.wait_for_run_completion(run_id, test_timeout)
                succ = (response.run.status.lower() == 'succeeded')
                utils.add_junit_test(test_cases, 'job completion', succ,
                                     'waiting for job completion failure')

                ###### Output Argo Log for Debugging ######
                workflow_json = client._get_workflow_json(run_id)
                workflow_id = workflow_json['metadata']['name']
                print("Argo Workflow Name: ", workflow_id)
                argo_log, _ = utils.run_bash_command(
                    'argo logs {} -n {}'.format(workflow_id, self._namespace))
                print("=========Argo Workflow Log=========")
                print(argo_log)

                if not succ:
                    utils.write_junit_xml(test_name, self._result, test_cases)
                    exit(1)

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
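
The same wait-and-verify loop can be exercised directly against a kfp.Client, without the JUnit plumbing. A sketch under assumed values; the host and experiment name below are placeholders.

from kfp import Client

client = Client(host="http://localhost:8080")                         # hypothetical host
experiment = client.get_experiment(experiment_name="my-experiment")   # hypothetical name
runs = client.list_runs(page_size=100, experiment_id=experiment.id).runs or []

for run in runs:
    # Block until the run finishes or the timeout (in seconds) expires.
    result = client.wait_for_run_completion(run.id, 1200)
    print(run.id, result.run.status)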
Code Example #3
def run_pipeline_func(client: kfp.Client,
                      pipeline_name: str,
                      pipeline_id: str,
                      pipeline_paramters_path: str,
                      recurring_flag: bool = False,
                      cron_exp: str = ''):
    pipeline_params = read_pipeline_params(
        pipeline_paramters_path=pipeline_paramters_path)
    pipeline_params = pipeline_params if pipeline_params is not None else {}

    experiment_id = None
    experiment_name = "{}-{}".format(pipeline_name,
                                     os.environ["INPUT_EXPERIMENT_NAME"])
    try:
        experiment_id = client.get_experiment(
            experiment_name=experiment_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=experiment_name).to_dict()["id"]

    namespace_env = os.getenv("INPUT_PIPELINE_NAMESPACE", "")
    # Treat an unset or all-whitespace namespace input as "no namespace".
    namespace = namespace_env if namespace_env.strip() else None

    job_name = 'Run {} on {}'.format(
        pipeline_name,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    logging.info(f"experiment_id: {experiment_id}, \
                 job_name: {job_name}, \
                 pipeline_params: {pipeline_params}, \
                 pipeline_id: {pipeline_id}, \
                 namespace: {namespace}, \
                 cron_exp: {cron_exp}")

    if str(recurring_flag).lower() == "true":
        client.create_recurring_run(experiment_id=experiment_id,
                                    job_name=job_name,
                                    params=pipeline_params,
                                    pipeline_id=pipeline_id,
                                    cron_expression=cron_exp)
        logging.info(
            "Successfully started the recurring pipeline, head over to kubeflow to check it out"
        )

    client.run_pipeline(experiment_id=experiment_id,
                        job_name=job_name,
                        params=pipeline_params,
                        pipeline_id=pipeline_id)
    logging.info(
        "Successfully started the pipeline, head over to kubeflow to check it out"
    )
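
A hedged sketch of calling the helper above. The environment variables, endpoint, and pipeline ID are placeholders standing in for the GitHub Actions inputs, and it assumes run_pipeline_func and its read_pipeline_params helper are importable from the surrounding module.

import os

import kfp

# Placeholder values standing in for the GitHub Actions inputs.
os.environ["INPUT_EXPERIMENT_NAME"] = "dev"
os.environ["INPUT_PIPELINE_NAMESPACE"] = "kubeflow"

client = kfp.Client(host="http://localhost:8080")         # hypothetical endpoint
run_pipeline_func(
    client=client,
    pipeline_name="my-pipeline",
    pipeline_id="00000000-0000-0000-0000-000000000000",   # hypothetical pipeline ID
    pipeline_paramters_path="params.yaml",                # file read by read_pipeline_params
)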
Code Example #4
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(
        test_cases, 'test script execution', (args.exit_code == '0'),
        'test script failure with exit code: ' + args.exit_code)

    if args.experiment is not None:
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=args.experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=1000,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, 1200)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, args.result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #5
    def check(self):
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Write the script exit code log ######
        utils.add_junit_test(test_cases, 'test script execution',
                             (self._exit_code == '0'),
                             'test script failure with exit code: '
                             + self._exit_code)

        if self._experiment is not None:  # Bypassing dsl type check sample.
            ###### Initialization ######
            host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
            client = Client(host=host)

            ###### Get experiments ######
            experiment_id = client.get_experiment(experiment_name=self._experiment).id

            ###### Get runs ######
            list_runs_response = client.list_runs(page_size=_RUN_LIST_PAGE_SIZE,
                                                  experiment_id=experiment_id)

            ###### Check all runs ######
            for run in list_runs_response.runs:
                run_id = run.id
                response = client.wait_for_run_completion(run_id, _TEST_TIMEOUT)
                succ = (response.run.status.lower() == 'succeeded')
                utils.add_junit_test(test_cases, 'job completion',
                                     succ, 'waiting for job completion failure')

                ###### Output Argo Log for Debugging ######
                workflow_json = client._get_workflow_json(run_id)
                workflow_id = workflow_json['metadata']['name']
                argo_log, _ = utils.run_bash_command(
                    'argo logs -n {} -w {}'.format(self._namespace, workflow_id))
                print("=========Argo Workflow Log=========")
                print(argo_log)

                if not succ:
                    utils.write_junit_xml(test_name, self._result, test_cases)
                    exit(1)

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
Code Example #6
File: kfpclient.py  Project: jkogut/kedro-kubeflow
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = AuthHandler().obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.pipeline_description = config.run_config.description
        self.generator = PipelineGenerator(config, project_name, context)

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generator.generate_pipeline(pipeline, image,
                                             image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )

        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def compile(self,
                pipeline,
                image,
                output,
                image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generator.generate_pipeline(pipeline, image,
                                             image_pull_policy),
            output,
        )
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generator.generate_pipeline(pipeline, image,
                                                    image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id, version_id) = self._upload_pipeline(pipeline)
            self.log.info("Pipeline created")

        self.log.info(
            "Pipeline link: %s/#/pipelines/details/%s/version/%s",
            self.host,
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines

        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id):
        version_name = f"{clean_name(self.project_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name,
                name=version_name,
                pipelineid=pipeline_id,
                _request_timeout=10000,
            ).id

    def _upload_pipeline(self, pipeline_func):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name,
                name=self.project_name,
                description=self.pipeline_description,
                _request_timeout=10000,
            )
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise

            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")

        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")
Code Example #7
File: kfpclient.py  Project: fossabot/kedro-kubeflow
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = self.obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.context = context
        dsl.ContainerOp._DISABLE_REUSABLE_COMPONENT_WARNING = True
        self.volume_meta = config.run_config.volume

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generate_pipeline(pipeline, image, image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )

        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def obtain_id_token(self):
        from google.auth.transport.requests import Request
        from google.oauth2 import id_token
        from google.auth.exceptions import DefaultCredentialsError

        client_id = os.environ.get(IAP_CLIENT_ID, None)

        jwt_token = None

        if not client_id:
            self.log.info(
                "No IAP_CLIENT_ID provided, skipping custom IAP authentication"
            )
            return jwt_token

        try:
            self.log.debug("Obtaining JWT token for %s." + client_id)
            jwt_token = id_token.fetch_id_token(Request(), client_id)
            self.log.info("Obtained JWT token for MLFLOW connectivity.")
        except DefaultCredentialsError as ex:
            self.log.warning(
                str(ex) +
                (" Note that this authentication method does not work with default"
                 " credentials obtained via 'gcloud auth application-default login'"
                 " command. Refer to documentation on how to configure service account"
                 " locally"
                 " (https://cloud.google.com/docs/authentication/production#manually)"
                 ))
        except Exception as e:
            self.log.error("Failed to obtain IAP access token. " + str(e))

        return jwt_token

    def generate_pipeline(self, pipeline, image, image_pull_policy):
        @dsl.pipeline(
            name=self.project_name,
            description="Kubeflow pipeline for Kedro project",
        )
        def convert_kedro_pipeline_to_kfp() -> None:
            """Convert from a Kedro pipeline into a kfp container graph."""

            node_volumes = (_setup_volumes()
                            if self.volume_meta is not None else {})
            node_dependencies = self.context.pipelines.get(
                pipeline).node_dependencies
            kfp_ops = _build_kfp_ops(node_dependencies, node_volumes)
            for node, dependencies in node_dependencies.items():
                for dependency in dependencies:
                    kfp_ops[node.name].after(kfp_ops[dependency.name])

        def _setup_volumes():
            vop = dsl.VolumeOp(
                name="data-volume-create",
                resource_name="data-volume",
                size=self.volume_meta.size,
                modes=self.volume_meta.access_modes,
                storage_class=self.volume_meta.storageclass,
            )
            if self.volume_meta.skip_init:
                return {"/home/kedro/data": vop.volume}
            else:
                volume_init = dsl.ContainerOp(
                    name="data-volume-init",
                    image=image,
                    command=["sh", "-c"],
                    arguments=[
                        " ".join([
                            "cp",
                            "--verbose",
                            "-r",
                            "/home/kedro/data/*",
                            "/home/kedro/datavolume",
                        ])
                    ],
                    pvolumes={"/home/kedro/datavolume": vop.volume},
                )
                volume_init.container.set_image_pull_policy(image_pull_policy)
                return {"/home/kedro/data": volume_init.pvolume}

        def _build_kfp_ops(node_dependencies: Dict[Node, Set[Node]],
                           node_volumes: Dict) -> Dict[str, dsl.ContainerOp]:
            """Build kfp container graph from Kedro node dependencies. """
            kfp_ops = {}

            env = [
                V1EnvVar(name=IAP_CLIENT_ID,
                         value=os.environ.get(IAP_CLIENT_ID, ""))
            ]

            if is_mlflow_enabled():
                kfp_ops["mlflow-start-run"] = dsl.ContainerOp(
                    name="mlflow-start-run",
                    image=image,
                    command=["kedro"],
                    arguments=[
                        "kubeflow",
                        "mlflow-start",
                        dsl.RUN_ID_PLACEHOLDER,
                    ],
                    file_outputs={"mlflow_run_id": "/tmp/mlflow_run_id"},
                )
                kfp_ops["mlflow-start-run"].container.set_image_pull_policy(
                    image_pull_policy)
                env.append(
                    V1EnvVar(
                        name="MLFLOW_RUN_ID",
                        value=kfp_ops["mlflow-start-run"].output,
                    ))

            for node in node_dependencies:
                name = _clean_name(node.name)
                kfp_ops[node.name] = dsl.ContainerOp(
                    name=name,
                    image=image,
                    command=["kedro"],
                    arguments=["run", "--node", node.name],
                    pvolumes=node_volumes,
                    container_kwargs={"env": env},
                )
                kfp_ops[node.name].container.set_image_pull_policy(
                    image_pull_policy)

            return kfp_ops

        return convert_kedro_pipeline_to_kfp

    def compile(self,
                pipeline,
                image,
                output,
                image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generate_pipeline(pipeline, image, image_pull_policy), output)
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generate_pipeline(pipeline, image, image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id,
                                                       self.project_name)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id,
             version_id) = self._upload_pipeline(pipeline, self.project_name)
            self.log.info("Pipeline created")

        self.log.info(
            "Pipeline link: %s/#/pipelines/details/%s/version/%s",
            self.host,
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines

        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id,
                                 pipeline_name):
        version_name = f"{_clean_name(pipeline_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name, name=version_name, pipelineid=pipeline_id).id

    def _upload_pipeline(self, pipeline_func, pipeline_name):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name, name=pipeline_name)
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise

            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")

        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")