Code Example #1
def get_pipeline(run_id, wait=0, namespace=None):
    """Get or wait for Pipeline status, wait time in sec"""

    client = Client(namespace=namespace or mlconf.namespace)
    if wait:
        resp = client.wait_for_run_completion(run_id, wait)
    else:
        resp = client.get_run(run_id)
    return resp
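A minimal, hypothetical way to call the helper above. The run ID and namespace are placeholders, and Client / mlconf are assumed to be imported in the surrounding module; as in the other examples on this page, a kfp v1 client is assumed, whose run details expose .run.status.

# Hypothetical usage of the helper above; the run ID is a placeholder.
resp = get_pipeline('3f1c2d4e-0000-0000-0000-placeholder', wait=600,
                    namespace='kubeflow')  # assumed namespace
print(resp.run.status)  # run detail returned by wait_for_run_completion / get_run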
Code Example #2
    def check(self):
        """ Check the pipeline running results of the notebook sample. """
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Write the script exit code log ######
        utils.add_junit_test(
            test_cases, 'test script execution', (self._exit_code == '0'),
            'test script failure with exit code: ' + self._exit_code)

        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config:{}'.format(yamlerr))
        except OSError as ose:
            raise FileNotFoundError('Default config not found:{}'.format(ose))
        else:
            test_timeout = raw_args['test_timeout']

        if self._run_pipeline:
            experiment = self._experiment_name
            ###### Initialization ######
            client = Client(host=self._host)

            ###### Get experiments ######
            experiment_id = client.get_experiment(
                experiment_name=experiment).id

            ###### Get runs ######
            list_runs_response = client.list_runs(page_size=RUN_LIST_PAGE_SIZE,
                                                  experiment_id=experiment_id)

            ###### Check all runs ######
            for run in list_runs_response.runs:
                run_id = run.id
                response = client.wait_for_run_completion(run_id, test_timeout)
                succ = (response.run.status.lower() == 'succeeded')
                utils.add_junit_test(test_cases, 'job completion', succ,
                                     'waiting for job completion failure')

                ###### Output Argo Log for Debugging ######
                workflow_json = client._get_workflow_json(run_id)
                workflow_id = workflow_json['metadata']['name']
                print("Argo Workflow Name: ", workflow_id)
                argo_log, _ = utils.run_bash_command(
                    'argo logs {} -n {}'.format(workflow_id, self._namespace))
                print("=========Argo Workflow Log=========")
                print(argo_log)

                if not succ:
                    utils.write_junit_xml(test_name, self._result, test_cases)
                    exit(1)

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
Code Example #3
File: run_basic_test.py  Project: rmoorman/pipelines
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    params = {}
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #4
def test_pipelines(name: str, params: list, fn: Callable):
    """Runs each pipeline that it's been parameterized for, and waits for it to succeed."""

    client = Client('127.0.0.1:8888')
    run = client.create_run_from_pipeline_func(
        fn, arguments={p['name']: p['value']
                       for p in params})
    completed = client.wait_for_run_completion(run.run_id, timeout=1200)
    status = completed.to_dict()['run']['status']
    assert status == 'Succeeded', f'Pipeline status is {status}'
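For reference, the dict comprehension above turns the list of name/value records into the arguments mapping that create_run_from_pipeline_func expects. A small illustration with placeholder values:

# Illustration of the arguments mapping built by the comprehension above
# (the parameter names and values are placeholders).
params = [{'name': 'url1', 'value': 'gs://bucket/a.txt'},
          {'name': 'url2', 'value': 'gs://bucket/b.txt'}]
arguments = {p['name']: p['value'] for p in params}
assert arguments == {'url1': 'gs://bucket/a.txt', 'url2': 'gs://bucket/b.txt'}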
Code Example #5
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(
        test_cases, 'test script execution', (args.exit_code == '0'),
        'test script failure with exit code: ' + args.exit_code)

    if args.experiment is not None:
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(
            experiment_name=args.experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=1000,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, 1200)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, args.result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #6
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Get experiments ######
    list_experiments_response = client.list_experiments(page_size=100)
    for experiment in list_experiments_response.experiments:
        if experiment.name == args.experiment:
            experiment_id = experiment.id

    ###### Get runs ######
    import kfp_run
    resource_reference_key_type = kfp_run.models.api_resource_type.ApiResourceType.EXPERIMENT
    resource_reference_key_id = experiment_id
    list_runs_response = client.list_runs(
        page_size=1000,
        resource_reference_key_type=resource_reference_key_type,
        resource_reference_key_id=resource_reference_key_id)

    ###### Check all runs ######
    for run in list_runs_response.runs:
        run_id = run.id
        response = client.wait_for_run_completion(run_id, 1200)
        succ = (response.run.status.lower() == 'succeeded')
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure')

        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print("=========Argo Workflow Log=========")
        print(argo_log)

        if not succ:
            utils.write_junit_xml(test_name, args.result, test_cases)
            exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #7
    def check(self):
        test_cases = []
        test_name = self._testname + ' Sample Test'

        ###### Write the script exit code log ######
        utils.add_junit_test(test_cases, 'test script execution',
                             (self._exit_code == '0'),
                             'test script failure with exit code: '
                             + self._exit_code)

        if self._experiment is not None:  # Bypassing dsl type check sample.
            ###### Initialization ######
            host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
            client = Client(host=host)

            ###### Get experiments ######
            experiment_id = client.get_experiment(experiment_name=self._experiment).id

            ###### Get runs ######
            list_runs_response = client.list_runs(page_size=_RUN_LIST_PAGE_SIZE,
                                                  experiment_id=experiment_id)

            ###### Check all runs ######
            for run in list_runs_response.runs:
                run_id = run.id
                response = client.wait_for_run_completion(run_id, _TEST_TIMEOUT)
                succ = (response.run.status.lower() == 'succeeded')
                utils.add_junit_test(test_cases, 'job completion',
                                     succ, 'waiting for job completion failure')

                ###### Output Argo Log for Debugging ######
                workflow_json = client._get_workflow_json(run_id)
                workflow_id = workflow_json['metadata']['name']
                argo_log, _ = utils.run_bash_command(
                    'argo logs -n {} -w {}'.format(self._namespace, workflow_id))
                print("=========Argo Workflow Log=========")
                print(argo_log)

                if not succ:
                    utils.write_junit_xml(test_name, self._result, test_cases)
                    exit(1)

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(test_name, self._result, test_cases)
Code Example #8
def _monitor_kfp_submission(runtime_config: dict, runtime_config_name: str,
                            run_id: str, timeout: int) -> str:
    """Monitor the status of a Kubeflow Pipelines run"""

    try:
        # Authenticate with the KFP server
        auth_info = KFPAuthenticator().authenticate(
            runtime_config.metadata["api_endpoint"].rstrip("/"),
            auth_type_str=runtime_config.metadata.get("auth_type"),
            runtime_config_name=runtime_config_name,
            auth_parm_1=runtime_config.metadata.get("api_username"),
            auth_parm_2=runtime_config.metadata.get("api_password"),
        )
    except AuthenticationError as ae:
        if ae.get_request_history() is not None:
            click.echo(
                "An authentication error was raised. Diagnostic information follows."
            )
            click.echo(ae.request_history_to_string())
        raise click.ClickException(f"Kubeflow authentication failed: {ae}")

    try:
        # Create a Kubeflow Pipelines client. There is no need to use a Tekton client,
        # because the monitoring API is agnostic.
        client = ArgoClient(
            host=runtime_config.metadata["api_endpoint"].rstrip("/"),
            cookies=auth_info.get("cookies", None),
            credentials=auth_info.get("credentials", None),
            existing_token=auth_info.get("existing_token", None),
            namespace=runtime_config.metadata.get("user_namespace"),
        )
        # wait for the run to complete or timeout (API uses seconds as unit - convert)
        run_details = client.wait_for_run_completion(run_id, timeout * 60)
    except TimeoutError:
        # pipeline processing did not finish yet, stop monitoring
        raise
    except Exception as ex:
        # log error and return 'unknown' status
        click.echo(f"Monitoring failed: {type(ex)}: {ex}")
        return "unknown"
    else:
        return run_details.run.status
Code Example #9
def test_pipelines():
    status = json.loads(
        check_output([
            "microk8s",
            "kubectl",
            "get",
            "services/ml-pipeline",
            "-nkubeflow",
            "-ojson",
        ]).decode("utf-8"))
    ip = status["spec"]["clusterIP"]
    client = Client(f"http://{ip}:8888")
    run = client.create_run_from_pipeline_func(
        download_and_join,
        arguments={
            "url1":
            "gs://ml-pipeline/sample-data/shakespeare/shakespeare1.txt",
            "url2":
            "gs://ml-pipeline/sample-data/shakespeare/shakespeare2.txt",
        },
    )
    completed = client.wait_for_run_completion(run.run_id, timeout=3600)
    status = completed.to_dict()["run"]["status"]
    assert status == "Succeeded", f"Pipeline status is {status}"
Code Example #10
def wait_for_pipeline_completion(run_id,
                                 timeout=60 * 60,
                                 expected_statuses: List[str] = None,
                                 namespace=None):
    """Wait for Pipeline status, timeout in sec

    :param run_id:     id of pipelines run
    :param timeout:    wait timeout in sec
    :param expected_statuses:  list of expected statuses, one of [ Succeeded | Failed | Skipped | Error ], by default
                               [ Succeeded ]
    :param namespace:  k8s namespace if not default

    :return: kfp run dict
    """
    if expected_statuses is None:
        expected_statuses = [RunStatuses.succeeded]
    namespace = namespace or mlconf.namespace
    remote = not get_k8s_helper(
        silent=True).is_running_inside_kubernetes_cluster()
    logger.debug(f"Waiting for run completion."
                 f" run_id: {run_id},"
                 f" expected_statuses: {expected_statuses},"
                 f" timeout: {timeout},"
                 f" remote: {remote},"
                 f" namespace: {namespace}")

    if remote:
        mldb = get_run_db()

        def get_pipeline_if_completed(run_id, namespace=namespace):
            resp = mldb.get_pipeline(run_id, namespace=namespace)
            status = resp["run"]["status"]
            if status not in RunStatuses.stable_statuses():

                # TODO: think of nicer liveness indication and make it re-usable
                # log '.' each retry as a liveness indication
                logger.debug(".")
                raise RuntimeError("pipeline run has not completed yet")

            return resp

        if mldb.kind != "http":
            raise ValueError(
                "get pipeline require access to remote api-service"
                ", please set the dbpath url")

        resp = retry_until_successful(
            10,
            timeout,
            logger,
            False,
            get_pipeline_if_completed,
            run_id,
            namespace=namespace,
        )
    else:
        client = Client(namespace=namespace)
        resp = client.wait_for_run_completion(run_id, timeout)
        if resp:
            resp = resp.to_dict()

    status = resp["run"]["status"] if resp else "unknown"
    if expected_statuses:
        if status not in expected_statuses:
            raise RuntimeError(f"run status {status} not in expected statuses")

    logger.debug(f"Finished waiting for pipeline completion."
                 f" run_id: {run_id},"
                 f" status: {status},"
                 f" namespace: {namespace}")

    return resp
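A minimal, hypothetical call to the helper above. The run ID is a placeholder, and the status strings follow the docstring; the function raises if the final status is not in expected_statuses.

# Hypothetical usage; the run ID is a placeholder.
run = wait_for_pipeline_completion(
    '3f1c2d4e-placeholder-run-id',
    timeout=30 * 60,                    # give up after 30 minutes
    expected_statuses=['Succeeded'],    # raise if the run ends in any other state
)
print(run['run']['status'])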
Code Example #11
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Kubeflow Sample Test'

    ###### Initialization ######
    client = Client()

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Create Experiment ######
    experiment_name = 'kubeflow sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'kubeflow_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'evaluation': 'gs://ml-pipeline-dataset/sample-test/flower/eval15.csv',
        'train': 'gs://ml-pipeline-dataset/sample-test/flower/train30.csv',
        'hidden-layer-size': '10,5',
        'steps': '5'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    #TODO: remove the namespace dependency or make it configurable.
    argo_log, _ = utils.run_bash_command(
        'argo logs -n kubeflow -w {}'.format(workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### Validate the results ######
    #   confusion matrix should show three columns for the flower data
    #     target, predicted, count
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusionmatrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = json.load(f)
        utils.add_junit_test(
            test_cases, 'confusion matrix format',
            (len(cm_data['outputs'][0]['schema']) == 3),
            'the column number of the confusion matrix output is not equal to three'
        )

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #12
File: run_sample_test.py  Project: zoyun/pipelines
class PySampleChecker(object):
    def __init__(self, testname, input, output, result, namespace='kubeflow'):
        """Util class for checking python sample test running results.

    :param testname: test name.
    :param input: The path of a pipeline file that will be submitted.
    :param output: The path of the test output.
    :param result: The path of the test result that will be exported.
    :param namespace: namespace of the deployed pipeline system. Default: kubeflow
    """
        self._testname = testname
        self._input = input
        self._output = output
        self._result = result
        self._namespace = namespace
        self._run_pipeline = None
        self._test_timeout = None

        self._test_cases = []
        self._test_name = self._testname + ' Sample Test'

        self._client = None
        self._experiment_id = None
        self._job_name = None
        self._test_args = None
        self._run_id = None

    def run(self):
        """Run compiled KFP pipeline."""

        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
        self._client = Client(host=host)

        ###### Check Input File ######
        utils.add_junit_test(self._test_cases, 'input generated yaml file',
                             os.path.exists(self._input),
                             'yaml file is not generated')
        if not os.path.exists(self._input):
            utils.write_junit_xml(self._test_name, self._result,
                                  self._test_cases)
            print('Error: job not found.')
            exit(1)

        ###### Create Experiment ######
        experiment_name = self._testname + ' sample experiment'
        response = self._client.create_experiment(experiment_name)
        self._experiment_id = response.id
        utils.add_junit_test(self._test_cases, 'create experiment', True)

        ###### Create Job ######
        self._job_name = self._testname + '_sample'
        ###### Figure out arguments from associated config files. #######
        self._test_args = {}
        config_schema = yamale.make_schema(SCHEMA_CONFIG)
        try:
            with open(DEFAULT_CONFIG, 'r') as f:
                raw_args = yaml.safe_load(f)
            default_config = yamale.make_data(DEFAULT_CONFIG)
            yamale.validate(
                config_schema,
                default_config)  # If fails, a ValueError will be raised.
        except yaml.YAMLError as yamlerr:
            raise RuntimeError('Illegal default config:{}'.format(yamlerr))
        except OSError as ose:
            raise FileNotFoundError('Default config not found:{}'.format(ose))
        else:
            self._test_timeout = raw_args['test_timeout']
            self._run_pipeline = raw_args['run_pipeline']

        try:
            with open(
                    os.path.join(CONFIG_DIR,
                                 '%s.config.yaml' % self._testname), 'r') as f:
                raw_args = yaml.safe_load(f)
            test_config = yamale.make_data(
                os.path.join(CONFIG_DIR, '%s.config.yaml' % self._testname))
            yamale.validate(
                config_schema,
                test_config)  # If fails, a ValueError will be raised.
        except yaml.YAMLError as yamlerr:
            print(
                'No legit yaml config file found, use default args:{}'.format(
                    yamlerr))
        except OSError as ose:
            print(
                'Config file with the same name not found, use default args:{}'
                .format(ose))
        else:
            self._test_args.update(raw_args['arguments'])
            if 'output' in self._test_args.keys(
            ):  # output is a special param that has to be specified dynamically.
                self._test_args['output'] = self._output
            if 'test_timeout' in raw_args.keys():
                self._test_timeout = raw_args['test_timeout']
            if 'run_pipeline' in raw_args.keys():
                self._run_pipeline = raw_args['run_pipeline']

        # Submit for pipeline running.
        if self._run_pipeline:
            response = self._client.run_pipeline(self._experiment_id,
                                                 self._job_name, self._input,
                                                 self._test_args)
            self._run_id = response.id
            utils.add_junit_test(self._test_cases, 'create pipeline run', True)

    def check(self):
        """Check pipeline run results."""
        if self._run_pipeline:
            ###### Monitor Job ######
            try:
                start_time = datetime.now()
                response = self._client.wait_for_run_completion(
                    self._run_id, self._test_timeout)
                succ = (response.run.status.lower() == 'succeeded')
                end_time = datetime.now()
                elapsed_time = (end_time - start_time).seconds
                utils.add_junit_test(self._test_cases, 'job completion', succ,
                                     'waiting for job completion failure',
                                     elapsed_time)
            finally:
                ###### Output Argo Log for Debugging ######
                workflow_json = self._client._get_workflow_json(self._run_id)
                workflow_id = workflow_json['metadata']['name']
                argo_log, _ = utils.run_bash_command(
                    'argo logs -n {} -w {}'.format(self._namespace,
                                                   workflow_id))
                print('=========Argo Workflow Log=========')
                print(argo_log)

            if not succ:
                utils.write_junit_xml(self._test_name, self._result,
                                      self._test_cases)
                exit(1)

            ###### Validate the results for specific test cases ######
            #TODO: Add result check for tfx-cab-classification after launch.
            if self._testname == 'xgboost_training_cm':
                # For xgboost sample, check its confusion matrix.
                cm_tar_path = './confusion_matrix.tar.gz'
                utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                            cm_tar_path,
                                            'mlpipeline-ui-metadata')
                with tarfile.open(cm_tar_path) as tar_handle:
                    file_handles = tar_handle.getmembers()
                    assert len(file_handles) == 1

                    with tar_handle.extractfile(file_handles[0]) as f:
                        cm_data = f.read()
                        utils.add_junit_test(
                            self._test_cases, 'confusion matrix format',
                            (len(cm_data) > 0),
                            'the confusion matrix file is empty')

        ###### Delete Job ######
        #TODO: add deletion when the backend API offers the interface.

        ###### Write out the test result in junit xml ######
        utils.write_junit_xml(self._test_name, self._result, self._test_cases)
Code Example #13
def main():
  args = parse_arguments()
  test_cases = []
  test_name = 'TFX Sample Test'

  ###### Initialization ######
  client = Client(namespace=args.namespace)

  ###### Check Input File ######
  utils.add_junit_test(test_cases, 'input generated yaml file', os.path.exists(args.input), 'yaml file is not generated')
  if not os.path.exists(args.input):
    utils.write_junit_xml(test_name, args.result, test_cases)
    exit()

  ###### Create Experiment ######
  experiment_name = 'TFX sample experiment'
  response = client.create_experiment(experiment_name)
  experiment_id = response.id
  utils.add_junit_test(test_cases, 'create experiment', True)

  ###### Create Job ######
  job_name = 'TFX_sample'
  params = {'output': args.output,
            'project': 'ml-pipeline-test',
            'column-names': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
            'evaluation': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv',
            'train': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv',
            'hidden-layer-size': '5',
            'steps': '5'}
  response = client.run_pipeline(experiment_id, job_name, args.input, params)
  run_id = response.id
  utils.add_junit_test(test_cases, 'create pipeline run', True)


  ###### Monitor Job ######
  start_time = datetime.now()
  response = client.wait_for_run_completion(run_id, 1200)
  succ = (response.run.status.lower() == 'succeeded')
  end_time = datetime.now()
  elapsed_time = (end_time - start_time).seconds
  utils.add_junit_test(test_cases, 'job completion', succ, 'waiting for job completion failure', elapsed_time)
  if not succ:
    utils.write_junit_xml(test_name, args.result, test_cases)
    exit()

  ###### Output Argo Log for Debugging ######
  workflow_json = client._get_workflow_json(run_id)
  workflow_id = workflow_json['metadata']['name']
  argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(args.namespace, workflow_id))
  print("=========Argo Workflow Log=========")
  print(argo_log)

  ###### Validate the results ######
  #TODO: enable after launch
  #   model analysis html is validated
  # argo_workflow_id = workflow_json['metadata']['name']
  # gcs_html_path = os.path.join(os.path.join(args.output, str(argo_workflow_id)), 'analysis/output_display.html')
  # print('Output display HTML path is ' + gcs_html_path)
  # utils.run_bash_command('gsutil cp ' + gcs_html_path + './')
  # display_file = open('./output_display.html', 'r')
  # is_next_line_state = False
  # for line in display_file:
  #   if is_next_line_state:
  #     state = line.strip()
  #     break
  #   if line.strip() == '<script type="application/vnd.jupyter.widget-state+json">':
  #     is_next_line_state = True
  # import json
  # state_json = json.loads(state)
  # succ = ('state' in state_json and 'version_major' in state_json and 'version_minor' in state_json)
  # utils.add_junit_test(test_cases, 'output display html', succ, 'the state json does not contain state, version_major, or version_minor')

  ###### Delete Job ######
  #TODO: add deletion when the backend API offers the interface.

  ###### Write out the test result in junit xml ######
  utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #14
  def check(self):
    """Run sample test and check results."""
    test_cases = []
    test_name = self._testname + ' Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(self._input), 'yaml file is not generated')
    if not os.path.exists(self._input):
      utils.write_junit_xml(test_name, self._result, test_cases)
      print('Error: job not found.')
      exit(1)

    ###### Create Experiment ######
    experiment_name = self._testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = self._testname + '_sample'
    ###### Figure out arguments from associated config files. #######
    test_args = {}
    try:
      with open(DEFAULT_CONFIG, 'r') as f:
        raw_args = yaml.safe_load(f)
    except yaml.YAMLError as yamlerr:
      raise RuntimeError('Illegal default config:{}'.format(yamlerr))
    except OSError as ose:
      raise FileNotFoundError('Default config not found:{}'.format(ose))
    else:
      test_timeout = raw_args['test_timeout']

    try:
      with open(os.path.join(CONFIG_DIR, '%s.config.yaml' % self._testname), 'r') as f:
        raw_args = yaml.safe_load(f)
    except yaml.YAMLError as yamlerr:
      print('No legit yaml config file found, use default args:{}'.format(yamlerr))
    except OSError as ose:
      print('Config file with the same name not found, use default args:{}'.format(ose))
    else:
      test_args.update(raw_args['arguments'])
      if 'output' in test_args.keys():  # output is a special param that has to be specified dynamically.
        test_args['output'] = self._output
      if 'test_timeout' in raw_args.keys():
        test_timeout = raw_args['test_timeout']

    response = client.run_pipeline(experiment_id, job_name, self._input, test_args)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
      start_time = datetime.now()
      response = client.wait_for_run_completion(run_id, test_timeout)
      succ = (response.run.status.lower() == 'succeeded')
      end_time = datetime.now()
      elapsed_time = (end_time - start_time).seconds
      utils.add_junit_test(test_cases, 'job completion', succ,
                           'waiting for job completion failure', elapsed_time)
    finally:
      ###### Output Argo Log for Debugging ######
      workflow_json = client._get_workflow_json(run_id)
      workflow_id = workflow_json['metadata']['name']
      argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        self._namespace, workflow_id))
      print('=========Argo Workflow Log=========')
      print(argo_log)

    if not succ:
      utils.write_junit_xml(test_name, self._result, test_cases)
      exit(1)

    ###### Validate the results for specific test cases ######
    #TODO: Add result check for tfx-cab-classification after launch.
    if self._testname == 'xgboost_training_cm':
      # For xgboost sample, check its confusion matrix.
      cm_tar_path = './confusion_matrix.tar.gz'
      utils.get_artifact_in_minio(workflow_json, 'confusion-matrix', cm_tar_path,
                                  'mlpipeline-ui-metadata')
      with tarfile.open(cm_tar_path) as tar_handle:
        file_handles = tar_handle.getmembers()
        assert len(file_handles) == 1

        with tar_handle.extractfile(file_handles[0]) as f:
          cm_data = f.read()
          utils.add_junit_test(test_cases, 'confusion matrix format',
                               (len(cm_data) > 0),
                               'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, self._result, test_cases)
Code Example #15
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Resnet CMLE Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = 'resnet cmle sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'cmle_sample'
    params = {
        'output': args.output,
        'project_id': 'ml-pipeline-test',
        'region': 'us-central1',
        'model': 'bolts',
        'version': 'beta1',
        'tf_version':
        '1.9',  # Watch out! If 1.9 is no longer supported we need to set it to a newer version.
        'train_csv':
        'gs://ml-pipeline-dataset/sample-test/bolts/bolt_images_train_sample1000.csv',
        'validation_csv':
        'gs://ml-pipeline-dataset/sample-test/bolts/bolt_images_validate_sample200.csv',
        'labels': 'gs://bolts_image_dataset/labels.txt',
        'depth': 50,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'steps_per_eval': 128,
        'train_steps': 128,
        'num_train_images': 1000,
        'num_eval_images': 200,
        'num_label_classes': 10
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
        start_time = datetime.now()
        response = client.wait_for_run_completion(run_id, 1800)
        succ = (response.run.status.lower() == 'succeeded')
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).seconds
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure',
                             elapsed_time)
    finally:
        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print("=========Argo Workflow Log=========")
        print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #16
def wait_for_pipeline_completion(run_id,
                                 timeout=60 * 60,
                                 expected_statuses: typing.List[str] = None,
                                 namespace=None):
    """Wait for Pipeline status, timeout in sec

    :param run_id:     id of pipelines run
    :param timeout:    wait timeout in sec
    :param expected_statuses:  list of expected statuses; if the final status is not
                               in the list, a RuntimeError is raised. One of:
                               succeeded | failed | skipped | error | running
    :param namespace:  k8s namespace if not default

    :return: kfp run dict
    """
    namespace = namespace or mlconf.namespace
    remote = not get_k8s_helper(
        init=False).is_running_inside_kubernetes_cluster()
    logger.debug(f"Waiting for run completion."
                 f" run_id: {run_id},"
                 f" expected_statuses: {expected_statuses},"
                 f" timeout: {timeout},"
                 f" remote: {remote},"
                 f" namespace: {namespace}")

    if remote:
        mldb = get_run_db().connect()

        def get_pipeline_if_completed(run_id, namespace=namespace):
            resp = mldb.get_pipeline(run_id, namespace=namespace)
            status = resp['run']['status']
            if status.lower() not in RunStatuses.stable_statuses():

                # TODO: think of nicer liveness indication and make it re-usable
                # log '.' each retry as a liveness indication
                logger.debug('.')
                raise RuntimeError('pipeline run has not completed yet')

            return resp

        if mldb.kind != 'http':
            raise ValueError(
                'get pipeline require access to remote api-service'
                ', please set the dbpath url')

        resp = retry_until_successful(10,
                                      timeout,
                                      logger,
                                      False,
                                      get_pipeline_if_completed,
                                      run_id,
                                      namespace=namespace)
    else:
        client = Client(namespace=namespace)
        resp = client.wait_for_run_completion(run_id, timeout)
        if resp:
            resp = resp.to_dict()

    status = resp['run']['status'] if resp else 'unknown'
    if expected_statuses:
        if status not in expected_statuses:
            raise RuntimeError(f"run status {status} not in expected statuses")

    logger.debug(f"Finished waiting for pipeline completion."
                 f" run_id: {run_id},"
                 f" status: {status},"
                 f" namespace: {namespace}")

    return resp
Code Example #17
File: run_sample_test.py  Project: striderw/pipelines
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    ###### Test-specific arguments #######
    if args.testname == 'tfx_cab_classification':
        params = {
            'output': args.output,
            'project': 'ml-pipeline-test',
            'column-names':
            'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
            'evaluation':
            'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv',
            'train':
            'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv',
            'hidden-layer-size': '5',
            'steps': '5'
        }
    elif args.testname == 'xgboost_training_cm':
        params = {
            'output': args.output,
            'project': 'ml-pipeline-test',
            'train-data':
            'gs://ml-pipeline-dataset/sample-test/sfpd/train_50.csv',
            'eval-data':
            'gs://ml-pipeline-dataset/sample-test/sfpd/eval_20.csv',
            'schema': 'gs://ml-pipeline-dataset/sample-test/sfpd/schema.json',
            'rounds': '20',
            'workers': '2'
        }
    else:
        # Basic tests require no additional params.
        params = {}

    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    try:
        start_time = datetime.now()
        if args.testname == 'xgboost_training_cm':
            response = client.wait_for_run_completion(run_id, 1800)
        else:
            response = client.wait_for_run_completion(run_id, 1200)
        succ = (response.run.status.lower() == 'succeeded')
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).seconds
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure',
                             elapsed_time)
    finally:
        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
            args.namespace, workflow_id))
        print('=========Argo Workflow Log=========')
        print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Validate the results for specific test cases ######
    #TODO: Add result check for tfx-cab-classification after launch.
    if args.testname == 'xgboost_training_cm':
        cm_tar_path = './confusion_matrix.tar.gz'
        utils.get_artifact_in_minio(workflow_json, 'confusion-matrix',
                                    cm_tar_path, 'mlpipeline-ui-metadata')
        with tarfile.open(cm_tar_path) as tar_handle:
            file_handles = tar_handle.getmembers()
            assert len(file_handles) == 1

            with tar_handle.extractfile(file_handles[0]) as f:
                cm_data = f.read()
                utils.add_junit_test(test_cases, 'confusion matrix format',
                                     (len(cm_data) > 0),
                                     'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #18
def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'XGBoost Sample Test'

    ###### Initialization ######
    host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
    client = Client(host=host)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file',
                         os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = 'xgboost sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'xgboost_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'train-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/train_50.csv',
        'eval-data': 'gs://ml-pipeline-dataset/sample-test/sfpd/eval_20.csv',
        'schema': 'gs://ml-pipeline-dataset/sample-test/sfpd/schema.json',
        'rounds': '20',
        'workers': '2'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1800)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(
        args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### If the job fails, skip the result validation ######
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Validate the results ######
    #   the confusion matrix output file should not be empty
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusion-matrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()

    with open(cm_filename, 'r') as f:
        cm_data = f.read()
        utils.add_junit_test(test_cases, 'confusion matrix format',
                             (len(cm_data) > 0),
                             'the confusion matrix file is empty')

    ###### Delete Job ######
    #TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)
Code Example #19
def wait_for_pipeline_completion(
    run_id,
    timeout=60 * 60,
    expected_statuses: List[str] = None,
    namespace=None,
    remote=True,
    project: str = None,
):
    """Wait for Pipeline status, timeout in sec

    :param run_id:     id of pipelines run
    :param timeout:    wait timeout in sec
    :param expected_statuses:  list of expected statuses, one of [ Succeeded | Failed | Skipped | Error ], by default
                               [ Succeeded ]
    :param namespace:  k8s namespace if not default
    :param remote:     read kfp data from mlrun service (default=True)
    :param project:    the project of the pipeline

    :return: kfp run dict
    """
    if expected_statuses is None:
        expected_statuses = [RunStatuses.succeeded]
    namespace = namespace or mlconf.namespace
    logger.debug(f"Waiting for run completion."
                 f" run_id: {run_id},"
                 f" project: {project},"
                 f" expected_statuses: {expected_statuses},"
                 f" timeout: {timeout},"
                 f" remote: {remote},"
                 f" namespace: {namespace}")

    if remote:
        mldb = get_run_db()

        def get_pipeline_if_completed(run_id, namespace=namespace):
            resp = mldb.get_pipeline(run_id,
                                     namespace=namespace,
                                     project=project)
            status = resp["run"]["status"]
            show_kfp_run(resp, clear_output=True)
            if status not in RunStatuses.stable_statuses():

                # TODO: think of nicer liveness indication and make it re-usable
                # log '.' each retry as a liveness indication
                logger.debug(".")
                raise RuntimeError("pipeline run has not completed yet")

            return resp

        if mldb.kind != "http":
            raise ValueError(
                "get pipeline require access to remote api-service"
                ", please set the dbpath url")

        resp = retry_until_successful(
            10,
            timeout,
            logger,
            False,
            get_pipeline_if_completed,
            run_id,
            namespace=namespace,
        )
    else:
        client = Client(namespace=namespace)
        resp = client.wait_for_run_completion(run_id, timeout)
        if resp:
            resp = resp.to_dict()
            resp = format_summary_from_kfp_run(resp)
        show_kfp_run(resp)

    status = resp["run"]["status"] if resp else "unknown"
    message = resp["run"].get("message", "")
    if expected_statuses:
        if status not in expected_statuses:
            raise RuntimeError(
                f"Pipeline run status {status}{', ' + message if message else ''}"
            )

    logger.debug(f"Finished waiting for pipeline completion."
                 f" run_id: {run_id},"
                 f" status: {status},"
                 f" message: {message},"
                 f" namespace: {namespace}")

    return resp
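A hypothetical call to the project-aware variant above. The run ID and project name are placeholders; RunStatuses is the same enumeration already used in the function body and is assumed to be in scope.

# Hypothetical usage; run ID and project are placeholders.
run = wait_for_pipeline_completion(
    '3f1c2d4e-placeholder-run-id',
    timeout=30 * 60,
    expected_statuses=[RunStatuses.succeeded],
    project='my-project',
    remote=True,    # read run state through the mlrun API service
)
print(run['run']['status'], run['run'].get('message', ''))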