Example no. 1
def run_pipeline_in_experiment(api_pipeline: ApiPipeline, parameters: dict = None, run_name: str = None,
                               wait_for_status: bool = False):
    try:
        client = KfpClient(_pipeline_service_url)
        experiment = client.create_experiment('PIPELINE_RUNS')
        run_result = client.run_pipeline(experiment_id=experiment.id,
                                         job_name=run_name or api_pipeline.name,
                                         params=parameters,
                                         pipeline_id=api_pipeline.id)
        run_id = run_result.id

        if wait_for_status:

            run_details = wait_for_run_status(client, run_id, 10)
            run_status = json.loads(run_details.pipeline_runtime.workflow_manifest)["status"]

            if run_status \
                    and run_status.get("phase", "").lower() in ["failed", "error"] \
                    and run_status.get("message"):
                raise RuntimeError(f"Run {run_id} failed with error: {run_status['message']}")

        return run_id

    except Exception as e:
        print(f"Exception trying to run pipeline {api_pipeline.id} '{api_pipeline.name}'"
              f" with parameters {parameters}:"
              f" {e}\n")
        # Fall back to the raw exception text and a 422 status when the error is not a KFP ApiException.
        raise ApiError(message=f"{getattr(e, 'body', str(e))}\nKFP URL: {_pipeline_service_url}",
                       http_status_code=getattr(e, 'status', None) or 422)
Example no. 2
def upload_experiments(
    client: kfp.Client,
    pipeline_name: str,
    github_sha: str,
    experiment_name: str = "",
) -> str:
    """Function to upload an experiment to Kubeflow Pipelines.

    The experiment is registered in Kubeflow Pipelines under the following name:
        {pipeline_name}-{experiment_name}
    If the experiment does not exist, it is created with that name.
    If no experiment name is specified, {pipeline_name}-default is used.

    Args:
        client (kfp.Client) : KFP client.
        pipeline_name (str) : The name of the pipeline function.
        github_sha (str) : GitHub SHA generated in GitHub Actions.
        experiment_name (str) : The experiment name. (Optional)

    Returns:
        str : The ID of the experiment.
    """
    register_name = (f"{pipeline_name}-{experiment_name}"
                     if experiment_name != "Default" else experiment_name)
    try:
        experiment_id = client.get_experiment(
            experiment_name=register_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=register_name).to_dict()["id"]
        logging.info(f"The experiment is newly registered : {register_name}")
    return experiment_id
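A minimal usage sketch for upload_experiments, assuming a reachable Kubeflow Pipelines endpoint; the host, pipeline name, and commit SHA below are placeholders:

import logging

import kfp

logging.basicConfig(level=logging.INFO)
client = kfp.Client(host="http://ml-pipeline.kubeflow:8888")  # placeholder endpoint
experiment_id = upload_experiments(
    client=client,
    pipeline_name="my-pipeline",   # hypothetical pipeline name
    github_sha="abc1234",          # hypothetical commit SHA
    experiment_name="staging",     # registered as "my-pipeline-staging"
)
print(experiment_id)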
Example no. 3
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    params = {}
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Example no. 4
def run_pipeline_func(client: kfp.Client,
                      pipeline_name: str,
                      pipeline_id: str,
                      pipeline_paramters_path: str,
                      recurring_flag: bool = False,
                      cron_exp: str = ''):
    pipeline_params = read_pipeline_params(
        pipeline_paramters_path=pipeline_paramters_path)
    pipeline_params = pipeline_params if pipeline_params is not None else {}

    experiment_id = None
    experiment_name = "{}-{}".format(pipeline_name,
                                     os.environ["INPUT_EXPERIMENT_NAME"])
    try:
        experiment_id = client.get_experiment(
            experiment_name=experiment_name).to_dict()["id"]
    except ValueError:
        experiment_id = client.create_experiment(
            name=experiment_name).to_dict()["id"]

    namespace = os.getenv("INPUT_PIPELINE_NAMESPACE") if not str.isspace(
        os.getenv("INPUT_PIPELINE_NAMESPACE")) else None

    job_name = 'Run {} on {}'.format(
        pipeline_name,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    logging.info(f"experiment_id: {experiment_id}, \
                 job_name: {job_name}, \
                 pipeline_params: {pipeline_params}, \
                 pipeline_id: {pipeline_id}, \
                 namespace: {namespace}, \
                 cron_exp: {cron_exp}")

    if recurring_flag == "true":
        client.create_recurring_run(experiment_id=experiment_id,
                                    job_name=job_name,
                                    params=pipeline_params,
                                    pipeline_id=pipeline_id,
                                    cron_expression=cron_exp)
        logging.info(
            "Successfully started the recurring pipeline, head over to kubeflow to check it out"
        )

    client.run_pipeline(experiment_id=experiment_id,
                        job_name=job_name,
                        params=pipeline_params,
                        pipeline_id=pipeline_id)
    logging.info(
        "Successfully started the pipeline, head over to kubeflow to check it out"
    )
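A rough invocation sketch for run_pipeline_func, assuming the environment variables it reads are set and that read_pipeline_params is available in the same module; all names, IDs, and the host are placeholders:

import os

import kfp

os.environ["INPUT_EXPERIMENT_NAME"] = "nightly"
os.environ["INPUT_PIPELINE_NAMESPACE"] = "kubeflow"

client = kfp.Client(host="http://ml-pipeline.kubeflow:8888")  # placeholder endpoint
run_pipeline_func(
    client=client,
    pipeline_name="my-pipeline",            # hypothetical pipeline name
    pipeline_id="11111111-2222-3333-4444",  # hypothetical KFP pipeline ID
    pipeline_paramters_path="params.yaml",  # file read by read_pipeline_params
)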
Example no. 5
def get_or_create_experiment(experiment_name: str, client: kfp.Client) -> ApiExperiment:
    existing_experiments = client.list_experiments().experiments

    if existing_experiments is not None:
        exp = next(iter([exp for exp in existing_experiments if exp.name == experiment_name]), None)
    else:
        exp = None

    if exp is None:
        exp = client.create_experiment(experiment_name)
        print('Experiment %s created with ID %s' % (exp.name, exp.id))
    else:
        print('Experiment already exists with id %s' % exp.id)

    return exp
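A short usage sketch for get_or_create_experiment, assuming a reachable KFP endpoint; the host and pipeline ID are placeholders:

import kfp

client = kfp.Client(host="http://ml-pipeline.kubeflow:8888")  # placeholder endpoint
experiment = get_or_create_experiment("my-experiment", client)
client.run_pipeline(
    experiment_id=experiment.id,
    job_name="my-run",
    pipeline_id="11111111-2222-3333-4444",  # hypothetical pipeline ID
)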
Example no. 6
def _mock_pipelines_creation(kfp_client_mock: kfp.Client):
    def _mock_create_experiment(name, description=None, namespace=None):
        return kfp_server_api.models.ApiExperiment(
            id="some-exp-id",
            name=name,
            description=description,
        )

    def _mock_run_pipeline(
        experiment_id,
        job_name,
        pipeline_package_path=None,
        params={},
        pipeline_id=None,
        version_id=None,
    ):
        return kfp_server_api.models.ApiRun(id="some-run-id", name=job_name)

    kfp_client_mock.create_experiment = _mock_create_experiment
    kfp_client_mock.run_pipeline = _mock_run_pipeline
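A sketch of how the mock helper above might be exercised in a test; the MagicMock spec and the assertions are illustrative assumptions rather than part of the helper:

from unittest import mock

import kfp

kfp_client_mock = mock.MagicMock(spec=kfp.Client)
_mock_pipelines_creation(kfp_client_mock)

experiment = kfp_client_mock.create_experiment("test-exp")
run = kfp_client_mock.run_pipeline(experiment_id=experiment.id, job_name="test-run")
assert experiment.id == "some-exp-id"
assert run.id == "some-run-id"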
Example no. 7
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Kubeflow Sample Test'

    ###### Initialization ######
    client = Client()

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Create Experiment ######
    experiment_name = 'kubeflow sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'kubeflow_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'evaluation': 'gs://ml-pipeline-dataset/sample-test/flower/eval15.csv',
        'train': 'gs://ml-pipeline-dataset/sample-test/flower/train30.csv',
        'hidden-layer-size': '10,5',
        'steps': '5'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    #TODO: remove the namespace dependency or make it configurable.
    argo_log, _ = utils.run_bash_command(
        'argo logs -n kubeflow -w {}'.format(workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### Validate the results ######
    #   confusion matrix should show three columns for the flower data
    #     target, predicted, count
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusionmatrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = json.load(f)
        utils.add_junit_test(
            test_cases, 'confusion matrix format',
            (len(cm_data['outputs'][0]['schema']) == 3),
            'the column number of the confusion matrix output is not equal to three'
        )

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Example no. 8
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = AuthHandler().obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.pipeline_description = config.run_config.description
        self.generator = PipelineGenerator(config, project_name, context)

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generator.generate_pipeline(pipeline, image,
                                             image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )

        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def compile(self,
                pipeline,
                image,
                output,
                image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generator.generate_pipeline(pipeline, image,
                                             image_pull_policy),
            output,
        )
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generator.generate_pipeline(pipeline, image,
                                                    image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id, version_id) = self._upload_pipeline(pipeline)
            self.log.info("Pipeline created")

        self.log.info(
            f"Pipeline link: {self.host}/#/pipelines/details/%s/version/%s",
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines

        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id):
        version_name = f"{clean_name(self.project_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name,
                name=version_name,
                pipelineid=pipeline_id,
                _request_timeout=10000,
            ).id

    def _upload_pipeline(self, pipeline_func):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name,
                name=self.project_name,
                description=self.pipeline_description,
                _request_timeout=10000,
            )
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise

            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")

        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")
Example no. 9
def run_pipeline(pipeline,
                 arguments=None,
                 experiment=None,
                 run=None,
                 namespace=None,
                 artifact_path=None,
                 ops=None,
                 url=None,
                 remote=False):
    """remote KubeFlow pipeline execution

    Submit a workflow task to KFP via mlrun API service

    :param pipeline:      KFP pipeline function or path to .yaml/.zip pipeline file
    :param arguments:     pipeline arguments
    :param experiment:    experiment name
    :param run:           optional, run name
    :param namespace:     Kubernetes namespace (if not using default)
    :param url:           optional, url to mlrun API service
    :param artifact_path: target location/url for mlrun artifacts
    :param ops:           additional operators (.apply() to all pipeline functions)
    :param remote:        use mlrun remote API service vs direct KFP APIs

    :return: kubeflow pipeline id
    """

    namespace = namespace or mlconf.namespace
    arguments = arguments or {}
    if remote or url:
        mldb = get_run_db(url).connect()
        if mldb.kind != 'http':
            raise ValueError(
                'run pipeline require access to remote api-service'
                ', please set the dbpath url')
        id = mldb.submit_pipeline(pipeline,
                                  arguments,
                                  experiment=experiment,
                                  run=run,
                                  namespace=namespace,
                                  ops=ops,
                                  artifact_path=artifact_path)

    else:
        client = Client(namespace=namespace)
        if isinstance(pipeline, str):
            experiment = client.create_experiment(name=experiment)
            run_result = client.run_pipeline(experiment.id,
                                             run,
                                             pipeline,
                                             params=arguments)
        else:
            conf = new_pipe_meta(artifact_path, ops)
            run_result = client.create_run_from_pipeline_func(
                pipeline,
                arguments,
                run_name=run,
                experiment_name=experiment,
                pipeline_conf=conf)

        id = run_result.run_id
    logger.info('Pipeline run id={}, check UI or DB for progress'.format(id))
    return id
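Two illustrative calls of the run_pipeline helper above, assuming an mlrun environment is already configured; the package file, argument, experiment name, and API URL are placeholders:

# Submit a pre-compiled pipeline package directly through the KFP client:
run_id = run_pipeline('pipeline.yaml',
                      arguments={'rounds': 10},   # hypothetical pipeline argument
                      experiment='my-experiment',
                      run='nightly-run')

# Or route the submission through the mlrun API service instead:
run_id = run_pipeline('pipeline.yaml',
                      experiment='my-experiment',
                      remote=True,
                      url='http://mlrun-api:8080')  # placeholder mlrun API URL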
Example no. 10
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    ###### Test-specific arguments #######
    if args.testname == 'tfx_cab_classification':
        params = {
            'output': args.output,
            'project': 'ml-pipeline-test',
            'column-names':
            'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
            'evaluation':
            'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv',
            'train':
            'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv',
            'hidden-layer-size': '5',
            'steps': '5'
        }
    elif args.testname == 'xgboost_training_cm':
        params = {
            'output': args.output,
            'project': 'ml-pipeline-test',
            'train-data':
            'gs://ml-pipeline-dataset/sample-test/sfpd/train_50.csv',
            'eval-data':
            'gs://ml-pipeline-dataset/sample-test/sfpd/eval_20.csv',
            'schema': 'gs://ml-pipeline-dataset/sample-test/sfpd/schema.json',
            'rounds': '20',
            'workers': '2'
        }
    else:
        # Basic tests require no additional params.
        params = {}

    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
        start_time = datetime.now()
        if args.testname == 'xgboost_training_cm':
            response = client.wait_for_run_completion(run_id, 1800)
        else:
            response = client.wait_for_run_completion(run_id, 1200)
        succ = (response.run.status.lower() == 'succeeded')
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).seconds
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure',
                             elapsed_time)
    finally:
        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print('=========Argo Workflow Log=========')
        print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Validate the results for specific test cases ######
    #TODO: Add result check for tfx-cab-classification after launch.
    if args.testname == 'xgboost_training_cm':
        cm_tar_path = './confusion_matrix.tar.gz'
        utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                    cm_tar_path, 'mlpipeline-ui-metadata')
        with tarfile.open(cm_tar_path) as tar_handle:
            file_handles = tar_handle.getmembers()
            assert len(file_handles) == 1

            with tar_handle.extractfile(file_handles[0]) as f:
                cm_data = f.read()
                utils.add_junit_test(test_cases, 'confusion matrix format',
                                     (len(cm_data) > 0),
                                     'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Example no. 11
class KubeflowClient:
    """
    A wrapper of the existing Kubeflow Pipelines Client which enriches it to 
    be able to access more of the Kubeflow Pipelines API.
    """
    def __init__(self,
                 host: Optional[str] = None,
                 client_id: Optional[str] = None,
                 namespace: Optional[str] = "kubeflow"):
        """
        Instantiate a new KubeflowClient

        Args:
            host (str): The host we can find the Kubeflow API at (e.g. https://{APP_NAME}.endpoints.{PROJECT_ID}.cloud.goog/pipeline)
            client_id (str): The IAP client id we can use for authorisation (e.g. "XXXXXX-XXXXXXXXX.apps.googleusercontent.com")
            namespace (str): The Kubernetes / Kubeflow namespace to deploy to (e.g. kubeflow)
        """
        self.host = host
        self.client_id = client_id
        self.namespace = namespace

        logging.info(f"KubeflowClient: host: {host}, client_id: {client_id}")
        self.kfp_client = Client(host, client_id, namespace)

        self.config = self.kfp_client._load_config(self.host, self.client_id,
                                                   self.namespace, None, None)

        # print(f"kfp auth:")
        # print(f"\thost: {self.host}")
        # print(f"\tclient_id: {self.client_id}")
        # print(f"\tnamespace: {self.namespace}")
        # print(f"\tapi_key: {self.config.api_key}")
        self.kfp_pipelines = self._connect_pipelines_api()
        self.kfp_runs = self._connect_runs_api()
        self.kfp_jobs = self._connect_jobs_api()

    def create_pipeline(self, pipeline_func, pipeline_name):
        """
        Create a new Kubeflow Pipeline using the provided pipeline function

        Args:
            pipeline_func: The method decorated by @dsl.pipeline which defines the pipeline

        Returns:
            The Kubeflow Pipeline object created
        """

        try:
            (_, pipeline_package_path) = tempfile.mkstemp(suffix=".zip")
            compiler.Compiler().compile(pipeline_func, pipeline_package_path)

            logging.info(f"Compiled pipeline to: {pipeline_package_path}")

            return self.kfp_client.upload_pipeline(pipeline_package_path,
                                                   pipeline_name)
        finally:
            pass
            # os.remove(pipeline_package_path)

    def create_experiment(self, experiment_name):
        """
        Create a new Kubeflow Pipelines Experiment (grouping of pipelines / runs)

        Args:
            experiment_name (str): The name of the experiment

        Returns:
            The Kubeflow experiment object created
        """
        return self.kfp_client.create_experiment(name=experiment_name)

    def list_experiments(self):
        """
        List the Experiments in the current namespace

        Returns:
            A list of all the Experiments
        """

        all_experiments = list()
        next_page_token = ""
        while next_page_token is not None:
            response = self.kfp_client.list_experiments(
                page_size=100, page_token=next_page_token)
            if response.experiments is None:
                break
            all_experiments.extend(response.experiments)
            next_page_token = response.next_page_token

        count = len(all_experiments)
        # print(f"list_experiments: found {count}")

        return all_experiments

    def find_job(self, job_name):
        """
        Look up a job by its name (in the current namespace).  Returns
        None if the job cannot be found

        Args:
            job_name (str): The name of the job to find

        Returns:
            A reference to the job if found, and None if not.
        """
        jobs = self.list_jobs()
        if jobs is None:
            return None

        for j in jobs:
            if j.name == job_name:
                return j
        return None

    def list_jobs(self):
        """
        List the Jobs in the current namespace

        Returns:
            A list of all the Jobs
        """

        all_jobs = list()
        next_page_token = ""
        while next_page_token is not None:
            response = self.kfp_jobs.list_jobs(page_size=100,
                                               page_token=next_page_token)
            if response.jobs is None:
                break
            all_jobs.extend(response.jobs)
            next_page_token = response.next_page_token

        count = len(all_jobs)
        # print(f"all_jobs: found {count}")

        return all_jobs

    def delete_job(self, job):
        """
        Delete a `Job` using its job.id

        Args:
            job (KubeflowJob): A `Job` object to delete

        Returns:
            True if the `Job` was deleted successfully
        """
        self.kfp_jobs.delete_job(id=job.id)
        return True

    def create_job(self,
                   name: str,
                   pipeline,
                   experiment,
                   description=None,
                   enabled=True,
                   max_concurrency=1,
                   cron=None):
        """
        Create a new Kubeflow Pipelines Job

        Args:
            name (str): The name of the `Job`
            pipeline (Pipeline): The `Pipeline` object to execute when the `Job` is called
            experiment (Experiment): The `Experiment` object to create the `Job` in.
            description (str): A description of what the `Job` is all about
            enabled (bool): Should the `Job` be enabled?
            max_concurrency (int): How many concurrent executions of the `Job` are allowed?
            cron (str): The CRON expression to use to execute the job periodically

        Returns:
            The Kubeflow API response object.
        """

        key = kfp_server_api.models.ApiResourceKey(
            id=experiment.id,
            type=kfp_server_api.models.ApiResourceType.EXPERIMENT)

        reference = kfp_server_api.models.ApiResourceReference(
            key, kfp_server_api.models.ApiRelationship.OWNER)

        spec = kfp_server_api.models.ApiPipelineSpec(pipeline_id=pipeline.id)

        trigger = None
        if cron is not None:
            cron_schedule = kfp_server_api.models.api_cron_schedule.ApiCronSchedule(
                cron=cron)
            trigger = kfp_server_api.models.api_trigger.ApiTrigger(
                cron_schedule=cron_schedule)

        run_body = kfp_server_api.models.ApiJob(
            name=name,
            description=description,
            pipeline_spec=spec,
            resource_references=[reference],
            enabled=True,
            trigger=trigger,
            max_concurrency=str(max_concurrency),
        )

        response = self.kfp_jobs.create_job(body=run_body)
        return response

    def list_runs(self, experiment_name):
        """
        List the `Runs` in the specified `Experiment`

        Args:
            experiment_name (str): The name of the `Experiment`

        Returns:
            A list of all the `Runs` in the current `Experiment`
        """
        experiment = self.find_experiment(name=experiment_name)
        all_runs = list()
        next_page_token = ""
        while next_page_token is not None:
            response = self.kfp_client.list_runs(
                page_size=100,
                page_token=next_page_token,
                experiment_id=experiment.id if experiment else None)
            if response.runs is None:
                break
            all_runs.extend(response.runs)
            next_page_token = response.next_page_token

        run_count = len(all_runs)
        # print(f"list_runs: found {run_count}")
        return all_runs

    def list_pipelines(self):
        """
        List the `Pipelines` in the current namespace

        Returns:
            A list of all the `Pipelines` in the current `Experiment`
        """
        all_pipelines = list()
        next_page_token = ""
        while next_page_token is not None:
            response = self.kfp_client.list_pipelines(
                page_size=100, page_token=next_page_token)
            if response.pipelines is None:
                break
            all_pipelines.extend(response.pipelines)
            next_page_token = response.next_page_token

        pipeline_count = len(all_pipelines)
        # print(f"list_pipelines: found {pipeline_count}")
        return all_pipelines

    def find_experiment(self, id=None, name=None):
        """
        Look up an `Experiment` by its name or id.  Returns
        None if the `Experiment` cannot be found.  Both `id` and
        `name` are optional, but at least one must be provided.
        Where both are provided, the function will return the
        first `Experiment` matching either id or name.

        Args:
            id (str): The `id` of the `Experiment` to find
            name (string): The `name` of the `Experiment` to find

        Returns:
            A reference to the `Experiment` if found, and None if not.
        """
        experiments = self.list_experiments()
        if experiments is None:
            return None
        for e in experiments:
            if e.name == name:
                return e
            if e.id == id:
                return e

        return None

    def find_pipeline(self, name):
        """
        Look up a `Pipeline`  by its name (in the current namespace).  Returns
        None if the `Pipeline` cannot be found

        Args:
            name (str): The name of the `Pipeline` to find

        Returns:
            A reference to the `Pipeline` if found, and `None` if not.
        """

        pipelines = self.list_pipelines()
        if pipelines is None:
            return None

        for p in pipelines:
            if p.name == name:
                return p
        return None

    def delete_pipeline(self, pipeline):
        """
        Delete the specified `Pipeline`

        Args:
            pipeline: The pipeline object to delete

        Returns:
            True if successful
        """
        # Go through all my pipelines to find the one to delete
        self.kfp_pipelines.delete_pipeline(pipeline.id)
        return True

    def _connect_pipelines_api(self):
        """
            Create a new PipelineServiceApi client
        """
        api_client = kfp_server_api.api_client.ApiClient(self.config)
        pipelines_api = kfp_server_api.api.pipeline_service_api.PipelineServiceApi(
            api_client)
        return pipelines_api

    def _connect_runs_api(self):
        """
            Create a new RunServiceApi client
        """
        api_client = kfp_server_api.api_client.ApiClient(self.config)
        runs_api = kfp_server_api.api.run_service_api.RunServiceApi(api_client)
        return runs_api

    def _connect_jobs_api(self):
        """
            Create a new JobServiceApi client
        """
        api_client = kfp_server_api.api_client.ApiClient(self.config)
        jobs_api = kfp_server_api.api.job_service_api.JobServiceApi(api_client)
        return jobs_api
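A brief end-to-end sketch of the wrapper above; the host, IAP client id, and my_pipeline_func (standing in for a function decorated with @dsl.pipeline) are placeholders:

client = KubeflowClient(
    host="https://myapp.endpoints.my-project.cloud.goog/pipeline",  # placeholder host
    client_id="XXXXXX-XXXXXXXXX.apps.googleusercontent.com",        # placeholder IAP client id
)

pipeline = client.create_pipeline(my_pipeline_func, "my-pipeline")  # hypothetical pipeline function
experiment = client.create_experiment("my-experiment")

# Schedule the uploaded pipeline to run daily in that experiment.
client.create_job(
    name="my-pipeline-daily",
    pipeline=pipeline,
    experiment=experiment,
    description="Daily run",
    cron="0 0 0 * * *",  # KFP cron schedules include a seconds field
)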
Example no. 12
class KubeflowClient(object):

    log = logging.getLogger(__name__)

    def __init__(self, config, project_name, context):
        token = self.obtain_id_token()
        self.host = config.host
        self.client = Client(self.host, existing_token=token)
        self.project_name = project_name
        self.context = context
        dsl.ContainerOp._DISABLE_REUSABLE_COMPONENT_WARNING = True
        self.volume_meta = config.run_config.volume

    def list_pipelines(self):
        pipelines = self.client.list_pipelines(page_size=30).pipelines
        return tabulate(map(lambda x: [x.name, x.id], pipelines),
                        headers=["Name", "ID"])

    def run_once(
        self,
        pipeline,
        image,
        experiment_name,
        run_name,
        wait,
        image_pull_policy="IfNotPresent",
    ) -> None:
        run = self.client.create_run_from_pipeline_func(
            self.generate_pipeline(pipeline, image, image_pull_policy),
            arguments={},
            experiment_name=experiment_name,
            run_name=run_name,
        )

        if wait:
            run.wait_for_run_completion(timeout=WAIT_TIMEOUT)

    def obtain_id_token(self):
        from google.auth.transport.requests import Request
        from google.oauth2 import id_token
        from google.auth.exceptions import DefaultCredentialsError

        client_id = os.environ.get(IAP_CLIENT_ID, None)

        jwt_token = None

        if not client_id:
            self.log.info(
                "No IAP_CLIENT_ID provided, skipping custom IAP authentication"
            )
            return jwt_token

        try:
            self.log.debug("Obtaining JWT token for %s.", client_id)
            jwt_token = id_token.fetch_id_token(Request(), client_id)
            self.log.info("Obtained JWT token for Kubeflow Pipelines connectivity.")
        except DefaultCredentialsError as ex:
            self.log.warning(
                str(ex) +
                (" Note that this authentication method does not work with default"
                 " credentials obtained via 'gcloud auth application-default login'"
                 " command. Refer to documentation on how to configure service account"
                 " locally"
                 " (https://cloud.google.com/docs/authentication/production#manually)"
                 ))
        except Exception as e:
            self.log.error("Failed to obtain IAP access token. " + str(e))
        finally:
            return jwt_token

    def generate_pipeline(self, pipeline, image, image_pull_policy):
        @dsl.pipeline(
            name=self.project_name,
            description="Kubeflow pipeline for Kedro project",
        )
        def convert_kedro_pipeline_to_kfp() -> None:
            """Convert from a Kedro pipeline into a kfp container graph."""

            node_volumes = (_setup_volumes()
                            if self.volume_meta is not None else {})
            node_dependencies = self.context.pipelines.get(
                pipeline).node_dependencies
            kfp_ops = _build_kfp_ops(node_dependencies, node_volumes)
            for node, dependencies in node_dependencies.items():
                for dependency in dependencies:
                    kfp_ops[node.name].after(kfp_ops[dependency.name])

        def _setup_volumes():
            vop = dsl.VolumeOp(
                name="data-volume-create",
                resource_name="data-volume",
                size=self.volume_meta.size,
                modes=self.volume_meta.access_modes,
                storage_class=self.volume_meta.storageclass,
            )
            if self.volume_meta.skip_init:
                return {"/home/kedro/data": vop.volume}
            else:
                volume_init = dsl.ContainerOp(
                    name="data-volume-init",
                    image=image,
                    command=["sh", "-c"],
                    arguments=[
                        " ".join([
                            "cp",
                            "--verbose",
                            "-r",
                            "/home/kedro/data/*",
                            "/home/kedro/datavolume",
                        ])
                    ],
                    pvolumes={"/home/kedro/datavolume": vop.volume},
                )
                volume_init.container.set_image_pull_policy(image_pull_policy)
                return {"/home/kedro/data": volume_init.pvolume}

        def _build_kfp_ops(node_dependencies: Dict[Node, Set[Node]],
                           node_volumes: Dict) -> Dict[str, dsl.ContainerOp]:
            """Build kfp container graph from Kedro node dependencies. """
            kfp_ops = {}

            env = [
                V1EnvVar(name=IAP_CLIENT_ID,
                         value=os.environ.get(IAP_CLIENT_ID, ""))
            ]

            if is_mlflow_enabled():
                kfp_ops["mlflow-start-run"] = dsl.ContainerOp(
                    name="mlflow-start-run",
                    image=image,
                    command=["kedro"],
                    arguments=[
                        "kubeflow",
                        "mlflow-start",
                        dsl.RUN_ID_PLACEHOLDER,
                    ],
                    file_outputs={"mlflow_run_id": "/tmp/mlflow_run_id"},
                )
                kfp_ops["mlflow-start-run"].container.set_image_pull_policy(
                    image_pull_policy)
                env.append(
                    V1EnvVar(
                        name="MLFLOW_RUN_ID",
                        value=kfp_ops["mlflow-start-run"].output,
                    ))

            for node in node_dependencies:
                name = _clean_name(node.name)
                kfp_ops[node.name] = dsl.ContainerOp(
                    name=name,
                    image=image,
                    command=["kedro"],
                    arguments=["run", "--node", node.name],
                    pvolumes=node_volumes,
                    container_kwargs={"env": env},
                )
                kfp_ops[node.name].container.set_image_pull_policy(
                    image_pull_policy)

            return kfp_ops

        return convert_kedro_pipeline_to_kfp

    def compile(self,
                pipeline,
                image,
                output,
                image_pull_policy="IfNotPresent"):
        Compiler().compile(
            self.generate_pipeline(pipeline, image, image_pull_policy), output)
        self.log.info("Generated pipeline definition was saved to %s" % output)

    def upload(self, pipeline, image, image_pull_policy="IfNotPresent"):
        pipeline = self.generate_pipeline(pipeline, image, image_pull_policy)

        if self._pipeline_exists(self.project_name):
            pipeline_id = self._get_pipeline_id(self.project_name)
            version_id = self._upload_pipeline_version(pipeline, pipeline_id,
                                                       self.project_name)
            self.log.info("New version of pipeline created: %s", version_id)
        else:
            (pipeline_id,
             version_id) = self._upload_pipeline(pipeline, self.project_name)
            self.log.info("Pipeline created")

        self.log.info(
            f"Pipeline link: {self.host}/#/pipelines/details/%s/version/%s",
            pipeline_id,
            version_id,
        )

    def _pipeline_exists(self, pipeline_name):
        return self._get_pipeline_id(pipeline_name) is not None

    def _get_pipeline_id(self, pipeline_name):
        pipelines = self.client.pipelines.list_pipelines(filter=json.dumps({
            "predicates": [{
                "key": "name",
                "op": 1,
                "string_value": pipeline_name,
            }]
        })).pipelines

        if pipelines:
            return pipelines[0].id

    def _upload_pipeline_version(self, pipeline_func, pipeline_id,
                                 pipeline_name):
        version_name = f"{_clean_name(pipeline_name)}-{uuid.uuid4()}"[:100]
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            return self.client.pipeline_uploads.upload_pipeline_version(
                f.name, name=version_name, pipelineid=pipeline_id).id

    def _upload_pipeline(self, pipeline_func, pipeline_name):
        with NamedTemporaryFile(suffix=".yaml") as f:
            Compiler().compile(pipeline_func, f.name)
            pipeline = self.client.pipeline_uploads.upload_pipeline(
                f.name, name=pipeline_name)
            return (pipeline.id, pipeline.default_version.id)

    def _ensure_experiment_exists(self, experiment_name):
        try:
            experiment = self.client.get_experiment(
                experiment_name=experiment_name)
            self.log.info(f"Existing experiment found: {experiment.id}")
        except ValueError as e:
            if not str(e).startswith("No experiment is found"):
                raise

            experiment = self.client.create_experiment(experiment_name)
            self.log.info(f"New experiment created: {experiment.id}")

        return experiment.id

    def schedule(self, experiment_name, cron_expression):
        experiment_id = self._ensure_experiment_exists(experiment_name)
        pipeline_id = self._get_pipeline_id(self.project_name)
        self._disable_runs(experiment_id, pipeline_id)
        self.client.create_recurring_run(
            experiment_id,
            f"{self.project_name} on {cron_expression}",
            cron_expression=cron_expression,
            pipeline_id=pipeline_id,
        )
        self.log.info("Pipeline scheduled to %s", cron_expression)

    def _disable_runs(self, experiment_id, pipeline_id):
        runs = self.client.list_recurring_runs(experiment_id=experiment_id)
        if runs.jobs is not None:
            my_runs = [
                job for job in runs.jobs
                if job.pipeline_spec.pipeline_id == pipeline_id
            ]
            for job in my_runs:
                self.client.jobs.delete_job(job.id)
                self.log.info(f"Previous schedule deleted {job.id}")
Example no. 13
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Resnet CMLE Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = 'resnet cmle sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'cmle_sample'
    params = {
        'output': args.output,
        'project_id': 'ml-pipeline-test',
        'region': 'us-central1',
        'model': 'bolts',
        'version': 'beta1',
        'tf_version':
        '1.9',  # Watch out! If 1.9 is no longer supported we need to set it to a newer version.
        'train_csv':
        'gs://ml-pipeline-dataset/sample-test/bolts/bolt_images_train_sample1000.csv',
        'validation_csv':
        'gs://ml-pipeline-dataset/sample-test/bolts/bolt_images_validate_sample200.csv',
        'labels': 'gs://bolts_image_dataset/labels.txt',
        'depth': 50,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'steps_per_eval': 128,
        'train_steps': 128,
        'num_train_images': 1000,
        'num_eval_images': 200,
        'num_label_classes': 10
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
        start_time = datetime.now()
        response = client.wait_for_run_completion(run_id, 1800)
        succ = (response.run.status.lower() == 'succeeded')
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).seconds
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure',
                             elapsed_time)
    finally:
        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print("=========Argo Workflow Log=========")
        print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Example no. 14
def main():
  args = parse_arguments()
  test_cases = []
  test_name = 'TFX Sample Test'

  ###### Initialization ######
  client = Client(namespace=args.namespace)

  ###### Check Input File ######
  utils.add_junit_test(test_cases, 'input generated yaml file', os.path.exists(args.input), 'yaml file is not generated')
  if not os.path.exists(args.input):
    utils.write_junit_xml(test_name, args.result, test_cases)
    exit()

  ###### Create Experiment ######
  experiment_name = 'TFX sample experiment'
  response = client.create_experiment(experiment_name)
  experiment_id = response.id
  utils.add_junit_test(test_cases, 'create experiment', True)

  ###### Create Job ######
  job_name = 'TFX_sample'
  params = {'output': args.output,
            'project': 'ml-pipeline-test',
            'column-names': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
            'evaluation': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv',
            'train': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv',
            'hidden-layer-size': '5',
            'steps': '5'}
  response = client.run_pipeline(experiment_id, job_name, args.input, params)
  run_id = response.id
  utils.add_junit_test(test_cases, 'create pipeline run', True)


  ###### Monitor Job ######
  start_time = datetime.now()
  response = client.wait_for_run_completion(run_id, 1200)
  succ = (response.run.status.lower()=='succeeded')
  end_time = datetime.now()
  elapsed_time = (end_time - start_time).seconds
  utils.add_junit_test(test_cases, 'job completion', succ, 'waiting for job completion failure', elapsed_time)
  if not succ:
    utils.write_junit_xml(test_name, args.result, test_cases)
    exit()

  ###### Output Argo Log for Debugging ######
  workflow_json = client._get_workflow_json(run_id)
  workflow_id = workflow_json['metadata']['name']
  argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(args.namespace, workflow_id))
  print("=========Argo Workflow Log=========")
  print(argo_log)

  ###### Validate the results ######
  #TODO: enable after launch
  #   model analysis html is validated
  # argo_workflow_id = workflow_json['metadata']['name']
  # gcs_html_path = os.path.join(os.path.join(args.output, str(argo_workflow_id)), 'analysis/output_display.html')
  # print('Output display HTML path is ' + gcs_html_path)
  # utils.run_bash_command('gsutil cp ' + gcs_html_path + './')
  # display_file = open('./output_display.html', 'r')
  # is_next_line_state = False
  # for line in display_file:
  #   if is_next_line_state:
  #     state = line.strip()
  #     break
  #   if line.strip() == '<script type="application/vnd.jupyter.widget-state+json">':
  #     is_next_line_state = True
  # import json
  # state_json = json.loads(state)
  # succ = ('state' in state_json and 'version_major' in state_json and 'version_minor' in state_json)
  # utils.add_junit_test(test_cases, 'output display html', succ, 'the state json does not contain state, version_major, or version_minor')

  ###### Delete Job ######
  #TODO: add deletion when the backend API offers the interface.

  ###### Write out the test result in junit xml ######
  utils.write_junit_xml(test_name, args.result, test_cases)
Example no. 15
  def check(self):
    """Run sample test and check results."""
    test_cases = []
    test_name = self._testname + ' Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(self._input), 'yaml file is not generated')
    if not os.path.exists(self._input):
      utils.write_junit_xml(test_name, self._result, test_cases)
      print('Error: job not found.')
      exit(1)

    ###### Create Experiment ######
    experiment_name = self._testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = self._testname + '_sample'
    ###### Figure out arguments from associated config files. #######
    test_args = {}
    try:
      with open(DEFAULT_CONFIG, 'r') as f:
        raw_args = yaml.safe_load(f)
    except yaml.YAMLError as yamlerr:
      raise RuntimeError('Illegal default config:{}'.format(yamlerr))
    except OSError as ose:
      raise FileExistsError('Default config not found:{}'.format(ose))
    else:
      test_timeout = raw_args['test_timeout']

    try:
      with open(os.path.join(CONFIG_DIR, '%s.config.yaml' % self._testname), 'r') as f:
          raw_args = yaml.safe_load(f)
    except yaml.YAMLError as yamlerr:
      print('No legit yaml config file found, use default args:{}'.format(yamlerr))
    except OSError as ose:
      print('Config file with the same name not found, use default args:{}'.format(ose))
    else:
      test_args.update(raw_args['arguments'])
      if 'output' in test_args.keys():  # output is a special param that has to be specified dynamically.
        test_args['output'] = self._output
      if 'test_timeout' in raw_args.keys():
        test_timeout = raw_args['test_timeout']

    response = client.run_pipeline(experiment_id, job_name, self._input, test_args)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
      start_time = datetime.now()
      response = client.wait_for_run_completion(run_id, test_timeout)
      succ = (response.run.status.lower() == 'succeeded')
      end_time = datetime.now()
      elapsed_time = (end_time - start_time).seconds
      utils.add_junit_test(test_cases, 'job completion', succ,
                           'waiting for job completion failure', elapsed_time)
    finally:
      ###### Output Argo Log for Debugging ######
      workflow_json = client._get_workflow_json(run_id)
      workflow_id = workflow_json['metadata']['name']
      argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        self._namespace, workflow_id))
      print('=========Argo Workflow Log=========')
      print(argo_log)

    if not succ:
      utils.write_junit_xml(test_name, self._result, test_cases)
      exit(1)

    ###### Validate the results for specific test cases ######
    #TODO: Add result check for tfx-cab-classification after launch.
    if self._testname == 'xgboost_training_cm':
      # For xgboost sample, check its confusion matrix.
      cm_tar_path = './confusion_matrix.tar.gz'
      utils.get_artifact_in_minio(workflow_json, 'confusion-matrix', cm_tar_path,
                                  'mlpipeline-ui-metadata')
      with tarfile.open(cm_tar_path) as tar_handle:
        file_handles = tar_handle.getmembers()
        assert len(file_handles) == 1

        with tar_handle.extractfile(file_handles[0]) as f:
          cm_data = f.read()
          utils.add_junit_test(test_cases, 'confusion matrix format',
                               (len(cm_data) > 0),
                               'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, self._result, test_cases)
Example no. 16
class PySampleChecker(object):
    def __init__(self, testname, input, output, result, namespace='kubeflow'):
        """Util class for checking python sample test running results.

        :param testname: test name.
        :param input: The path of a pipeline file that will be submitted.
        :param output: The path of the test output.
        :param result: The path of the test result that will be exported.
        :param namespace: namespace of the deployed pipeline system. Default: kubeflow
        """
        self._testname = testname
        self._input = input
        self._output = output
        self._result = result
        self._namespace = namespace
        self._run_pipeline = None
        self._test_timeout = None

        self._test_cases = []
        self._test_name = self._testname + ' Sample Test'

        self._client = None
        self._experiment_id = None
        self._job_name = None
        self._test_args = None
        self._run_id = None

    def run(self):
        """Run compiled KFP pipeline."""

        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
        self._client = Client(host=host)

        ###### Check Input File ######
        utils.add_junit_test(self._test_cases, 'input generated yaml file',
                             os.path.exists(self._input),
                             'yaml file is not generated')
        if not os.path.exists(self._input):
            utils.write_junit_xml(self._test_name, self._result,
                                  self._test_cases)
            print('Error: compiled pipeline file not found.')
            exit(1)

        ###### Create Experiment ######
        experiment_name = self._testname + ' sample experiment'
        response = self._client.create_experiment(experiment_name)
        self._experiment_id = response.id
        utils.add_junit_test(self._test_cases, 'create experiment', True)

        ###### Create Job ######
        self._job_name = self._testname + '_sample'
        ###### Figure out arguments from associated config files. #######
        self._test_args = {}
        config_schema = yamale.make_schema(SCHEMA_CONFIG)
        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
            default_config = yamale.make_data(DEFAULT_CONFIG)
            yamale.validate(
                config_schema,
                default_config)  # If fails, a ValueError will be raised.
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config:{}'.format(yamlerr))
        except OSError as ose:
            raise FileNotFoundError('Default config not found: {}'.format(ose))
        else:
            self._test_timeout = raw_args['test_timeout']
            self._run_pipeline = raw_args['run_pipeline']

        try:
            with open(
                    os.path.join(CONFIG_DIR,
                                 '%s.config.yaml' % self._testname), 'r') as f:
                raw_args = yaml.safe_load(f)
            test_config = yamale.make_data(
                os.path.join(CONFIG_DIR, '%s.config.yaml' % self._testname))
            yamale.validate(
                config_schema,
                test_config)  # If fails, a ValueError will be raised.
        except yaml.YAMLError as yamlerr:
            print('No valid yaml config file found, using default args: {}'.format(
                yamlerr))
        except OSError as ose:
            print('Config file with the same name not found, using default args: {}'.format(
                ose))
        else:
            self._test_args.update(raw_args['arguments'])
            if 'output' in self._test_args.keys():
                # output is a special param that has to be specified dynamically.
                self._test_args['output'] = self._output
            if 'test_timeout' in raw_args.keys():
                self._test_timeout = raw_args['test_timeout']
            if 'run_pipeline' in raw_args.keys():
                self._run_pipeline = raw_args['run_pipeline']

        # Submit for pipeline running.
        if self._run_pipeline:
            response = self._client.run_pipeline(self._experiment_id,
                                                 self._job_name, self._input,
                                                 self._test_args)
            self._run_id = response.id
            utils.add_junit_test(self._test_cases, 'create pipeline run', True)

    def check(self):
        """Check pipeline run results."""
        if self._run_pipeline:
            ###### Monitor Job ######
            try:
                start_time = datetime.now()
                response = self._client.wait_for_run_completion(
                    self._run_id, self._test_timeout)
                succ = (response.run.status.lower() == 'succeeded')
                end_time = datetime.now()
                elapsed_time = (end_time - start_time).seconds
                utils.add_junit_test(self._test_cases, 'job completion', succ,
                                     'waiting for job completion failure',
                                     elapsed_time)
            finally:
                ###### Output Argo Log for Debugging ######
                workflow_json = self._client._get_workflow_json(self._run_id)
                workflow_id = workflow_json['metadata']['name']
                argo_log, _ = utils.run_bash_command(
                    'argo logs -n {} -w {}'.format(self._namespace,
                                                   workflow_id))
                print('=========Argo Workflow Log=========')
                print(argo_log)

            if not succ:
                utils.write_junit_xml(self._test_name, self._result,
                                      self._test_cases)
                exit(1)

            ###### Validate the results for specific test cases ######
            #TODO: Add result check for tfx-cab-classification after launch.
            if self._testname == 'xgboost_training_cm':
                # For xgboost sample, check its confusion matrix.
                cm_tar_path = './confusion_matrix.tar.gz'
                utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                            cm_tar_path,
                                            'mlpipeline-ui-metadata')
                with tarfile.open(cm_tar_path) as tar_handle:
                    file_handles = tar_handle.getmembers()
                    assert len(file_handles) == 1

                    with tar_handle.extractfile(file_handles[0]) as f:
                        cm_data = f.read()
                        utils.add_junit_test(
                            self._test_cases, 'confusion matrix format',
                            (len(cm_data) > 0),
                            'the confusion matrix file is empty')

        ###### Delete Job ######
        #TODO: add deletion when the backend API offers the interface.

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(self._test_name, self._result, self._test_cases)
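PySampleChecker.run() merges a default config with an optional per-test YAML file validated against a yamale schema. The snippet below illustrates only the keys the code above actually reads (test_timeout, run_pipeline, arguments); the exact file layout is an assumption for illustration.

# Illustrative per-test config as consumed by PySampleChecker.run(); layout is assumed.
import yaml

_example_config = """
test_timeout: 1800        # seconds to wait for run completion
run_pipeline: true        # set false to only compile, skip submission
arguments:
  output: ''              # filled in dynamically from self._output
  rounds: '20'
  workers: '2'
"""

raw_args = yaml.safe_load(_example_config)
assert 'test_timeout' in raw_args and 'arguments' in raw_args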
Example n. 17
0
def run_pipeline(
    pipeline,
    arguments=None,
    project=None,
    experiment=None,
    run=None,
    namespace=None,
    artifact_path=None,
    ops=None,
    url=None,
    ttl=None,
):
    """remote KubeFlow pipeline execution

    Submit a workflow task to KFP via mlrun API service

    :param pipeline:   KFP pipeline function or path to .yaml/.zip pipeline file
    :param arguments:  pipeline arguments
    :param project:    project name (used to expand {{run.project}} in artifact_path)
    :param experiment: experiment name
    :param run:        optional, run name
    :param namespace:  Kubernetes namespace (if not using default)
    :param url:        optional, url to mlrun API service
    :param artifact_path:  target location/url for mlrun artifacts
    :param ops:        additional operators (.apply() to all pipeline functions)
    :param ttl:        pipeline ttl in secs (after that the pods will be removed)

    :returns: kubeflow pipeline id
    """

    remote = not get_k8s_helper(
        silent=True).is_running_inside_kubernetes_cluster()

    artifact_path = artifact_path or mlconf.artifact_path
    if artifact_path and "{{run.uid}}" in artifact_path:
        artifact_path.replace("{{run.uid}}", "{{workflow.uid}}")
    if artifact_path and "{{run.project}}" in artifact_path:
        if not project:
            raise ValueError("project name must be specified with this" +
                             f" artifact_path template {artifact_path}")
        artifact_path.replace("{{run.project}}", project)
    if not artifact_path:
        raise ValueError("artifact path was not specified")

    namespace = namespace or mlconf.namespace
    arguments = arguments or {}

    if remote or url:
        mldb = get_run_db(url)
        if mldb.kind != "http":
            raise ValueError(
                "run pipeline requires access to the remote api-service"
                ", please set the dbpath url")
        id = mldb.submit_pipeline(
            pipeline,
            arguments,
            experiment=experiment,
            run=run,
            namespace=namespace,
            ops=ops,
            artifact_path=artifact_path,
        )

    else:
        client = Client(namespace=namespace)
        if isinstance(pipeline, str):
            experiment = client.create_experiment(name=experiment)
            run_result = client.run_pipeline(experiment.id,
                                             run,
                                             pipeline,
                                             params=arguments)
        else:
            conf = new_pipe_meta(artifact_path, ttl, ops)
            run_result = client.create_run_from_pipeline_func(
                pipeline,
                arguments,
                run_name=run,
                experiment_name=experiment,
                pipeline_conf=conf,
            )

        id = run_result.run_id
    logger.info("Pipeline run id={}, check UI or DB for progress".format(id))
    return id
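A minimal usage sketch of the run_pipeline() helper above; the file path, experiment name, and arguments are placeholders, assuming a pre-compiled pipeline package and a reachable mlrun/KFP endpoint.

# Hypothetical call, assuming 'my_pipeline.yaml' was produced by the KFP compiler
# and that the artifact path below is valid for the target cluster.
run_id = run_pipeline(
    'my_pipeline.yaml',              # path to a compiled .yaml/.zip pipeline
    arguments={'learning_rate': 0.1},
    experiment='sample-experiment',
    run='sample-run',
    namespace='kubeflow',
    artifact_path='/tmp/artifacts',  # avoids the "artifact path was not specified" error
)
print('submitted pipeline run:', run_id)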
Example n. 18
0
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'XGBoost Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: compiled pipeline file not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = 'xgboost sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'xgboost_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'train-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/train_50.csv',
        'eval-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/eval_20.csv',
        'schema': 'gs://ml-pipeline-dataset/sample-test/sfpd/schema.json',
        'rounds': '20',
        'workers': '2'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1800)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### If the job fails, skip the result validation ######
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Validate the results ######
    #   confusion matrix should show three columns for the flower data
    #     target, predicted, count
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusion-matrix', cm_tar_path)
    with tarfile.open(cm_tar_path) as tar_handler:
        tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = f.read()
        utils.add_junit_test(test_cases, 'confusion matrix format',
                             (len(cm_data) > 0),
                             'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
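main() above assumes a parse_arguments() helper; a plausible argparse sketch is shown below, with flag names inferred only from the attributes the script reads (input, output, result, namespace). The actual helper may differ.

# Hypothetical parse_arguments() matching the attributes used in main(); flag names are assumed.
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser(description='XGBoost sample test runner')
    parser.add_argument('--input', required=True,
                        help='path of the compiled pipeline yaml to submit')
    parser.add_argument('--output', required=True,
                        help='output location passed to the pipeline as the "output" param')
    parser.add_argument('--result', required=True,
                        help='path of the junit xml report to write')
    parser.add_argument('--namespace', default='kubeflow',
                        help='namespace of the deployed pipeline system')
    return parser.parse_args()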