def verify_result_contract(result_object: MLObject, expected_schema_type,
                           expected_schema_version, step_name: str):
    """Validate a step's result object and check that it declares the expected schema.

    The result object is re-created through ``MLObject.create_object_from_string``
    from its ``dict_without_internal_variables()`` representation; the schema type
    and version of the re-created object are then compared with the expected ones.

    Args:
        result_object: The MLObject produced by the step.
        expected_schema_type: Schema type the result must declare.
        expected_schema_version: Schema version the result must declare.
        step_name: Name of the step; only used in log and error messages.

    Returns:
        True when the object loads without errors and matches the expected schema.

    Raises:
        ValueError: If re-creating the object reports errors, or if the schema
            type or version differs from the expected values.
    """
    rootLogger = logging.getLogger()

    (contract_object, errors) = MLObject.create_object_from_string(
        result_object.dict_without_internal_variables())

    # Truthiness covers both "no errors" shapes: None and an empty container.
    if errors:
        error_string = f"Error verifying result object for '{step_name}.output': {errors}"
        rootLogger.debug(error_string)
        raise ValueError(error_string)

    if (contract_object.schema_type != expected_schema_type) or (
            contract_object.schema_version != expected_schema_version):
        error_string = f"""Actual data does not match the expected schema and version:
    Expected Type: {expected_schema_type}
    Actual Type: {contract_object.schema_type}

    Expected Version: {expected_schema_version}
    Actual Version: {contract_object.schema_version}"""
        rootLogger.debug(error_string)
        raise ValueError(error_string)

    rootLogger.debug(
        f"Successfully loaded and validated contract object: {contract_object.schema_type} on step {step_name}.output"
    )

    return True
    def test_package(self):
        """Check that a package result only verifies once it has been populated.

        An MLObject with nothing but its schema type and version set must be
        rejected by ``verify_result_contract``. After the sample execution
        script has run and the run metadata and execution profile have been
        filled in, verification must succeed.
        """
        step_name = "train"
        expected_results_schema_type = "package_results"  # MUST BE A LOADED SCHEMA
        expected_results_schema_version = "9999.0.1"  # MUST BE A SEMVER

        # Not referenced again in this method, but it is part of the namespace
        # handed to the sample script below.
        step_execution_object = StepExecution(self.input_parameters,
                                              self.execution_parameters)

        results_ml_object = MLObject()
        results_ml_object.set_type(
            schema_type=expected_results_schema_type,
            schema_version=expected_results_schema_version,
        )

        # Should error due to missing fields
        with self.assertRaises(ValueError) as context:
            verify_result_contract(
                results_ml_object,
                expected_results_schema_type,
                expected_results_schema_version,
                step_name,
            )

        self.assertIn(
            f"Error verifying result object for '{step_name}.output'",
            str(context.exception))

        # These two names are only read by the sample script, not by this method.
        result_ml_object_schema_type = expected_results_schema_type
        result_ml_object_schema_version = expected_results_schema_version

        # exec() cannot rebind a function's local variables through locals(),
        # so run the script against an explicit namespace and read the result
        # object back from it afterwards.
        exec_namespace = dict(locals())
        exec(
            (Path("tests") / "sample_package_execution.py").read_text(),
            globals(),
            exec_namespace,
        )
        results_ml_object = exec_namespace["results_ml_object"]

        results_ml_object.run_date = datetime.datetime.now()
        results_ml_object.step_id = uuid.uuid4()
        results_ml_object.run_id = uuid.uuid4()

        profile = results_ml_object.execution_profile
        profile.system_memory_utilization = random()
        # randint() needs real integers: float bounds such as 7e9 raise
        # TypeError on Python 3.12 and later.
        profile.network_traffic_in_bytes = randint(7 * 10**9, 9 * 10**10)
        profile.gpu_temperature = randint(70, 130)
        profile.disk_io_utilization = random()
        profile.gpu_percent_of_time_accessing_memory = random()
        profile.cpu_utilization = random()
        profile.gpu_utilization = random()
        profile.gpu_memory_allocation = random()

        self.assertTrue(
            verify_result_contract(
                results_ml_object,
                expected_results_schema_type,
                expected_results_schema_version,
                step_name,
            ))
Beispiel #3
0
    def __init__(self, path: str):
        """Load the task name and parameter bindings from a metadata file.

        Args:
            path (str): Path to the file; it is made absolute before loading.

        Raises:
            RuntimeError: If ``MLObject.create_object_from_file`` reports an error.
        """
        spec, load_error = MLObject.create_object_from_file(os.path.abspath(path))
        if load_error:
            raise RuntimeError(load_error)

        self.task_name: str = spec['task_name']
        # Input/output bindings map parameter name to parameter value
        self.input_binding: dict = spec['input_binding']
        self.output_binding: dict = spec['output_binding']
Beispiel #4
0
 def __init__(self, file_path: str):
     """Load the Docker runtime settings from a metadata file.

     Args:
         file_path (str): Path to a 'mlbox_docker.yaml' that is usually located in the MLBox root directory.

     Raises:
         RuntimeError: If ``MLObject.create_object_from_file`` reports an error.
     """
     config, load_error = MLObject.create_object_from_file(file_path)
     if load_error:
         raise RuntimeError(load_error)
     self.type = RuntimeType.Docker
     # Docker image name
     self.image = config['image']
     # Docker executable ('docker' or 'nvidia-docker').
     self.docker = config['docker_runtime']
Beispiel #5
0
    def __init__(self, path: str):
        """Load the declared inputs and outputs of an MLBox task.

        Both ``inputs`` and ``outputs`` become dictionaries mapping each
        entry's 'name' to its 'type'; a missing section yields an empty dict.

        Args:
            path (str): Path to a MLBox task file.

        Raises:
            RuntimeError: If ``MLObject.create_object_from_file`` reports an error.
        """
        task_spec, load_error = MLObject.create_object_from_file(os.path.abspath(path))
        if load_error:
            raise RuntimeError(load_error)

        declared_inputs = task_spec.get('inputs', [])
        self.inputs = {entry['name']: entry['type'] for entry in declared_inputs}
        declared_outputs = task_spec.get('outputs', [])
        self.outputs = {entry['name']: entry['type'] for entry in declared_outputs}
Beispiel #6
0
 def __init__(self, path: str, **kwargs: Any):
     """Load a Singularity platform definition.

     Args:
         path (str): Path to a Singularity platform that is usually located in the MLBox `platforms` directory.
         **kwargs (Any): Reserved for future use to unify implementation of Platform Definition classes across
             various runners.

     Raises:
         RuntimeError: If ``MLObject.create_object_from_file`` reports an error.
     """
     platform_spec, load_error = MLObject.create_object_from_file(path)
     if load_error:
         raise RuntimeError(load_error)
     self.type: str = 'singularity'
     self.image: str = platform_spec['image']
    def test_e2e(self):
        """Run the step end to end and verify its result against the contract.

        An MLObject with only its schema set must be rejected; the object
        returned by ``StepExecution.execute`` (plus run metadata) must verify.
        """
        MLSchema.populate_registry()
        schema_dir = Path.cwd() / ".parameters" / "schemas"
        MLSchema.append_schema_to_registry(schema_dir)

        # THESE SHOULD BE THE ONLY SETTINGS FOR THIS FILE
        step_name = "process_data"
        schema_type = "data_result"  # MUST BE A LOADED SCHEMA
        schema_version = "0.0.1"  # MUST BE A SEMVER

        # Execute step; put sample required input / execution parameters
        # into the two dictionaries (in that order).
        step = StepExecution({}, {})

        unpopulated = MLObject()
        unpopulated.set_type(schema_type=schema_type,
                             schema_version=schema_version)

        # Should error due to missing fields
        with self.assertRaises(ValueError) as raised:
            verify_result_contract(unpopulated, schema_type, schema_version,
                                   step_name)

        expected_fragment = f"Error verifying result object for '{step_name}.output'"
        self.assertTrue(expected_fragment in str(raised.exception))

        populated = step.execute(
            result_object_schema_type=schema_type,
            result_object_schema_version=schema_version,
        )
        populated.run_date = datetime.datetime.now()
        populated.step_id = uuid.uuid4()
        populated.run_id = uuid.uuid4()

        verified = verify_result_contract(populated, schema_type,
                                          schema_version, step_name)
        self.assertTrue(verified)
Beispiel #8
0
    def __init__(self, path: str):
        """Load the MLBox root metadata and its Docker runtime.

        Args:
            path (str): Path to a MLBox root directory.

        Raises:
            RuntimeError: If loading 'mlbox.yaml' from the directory reports an error.
        """
        root_dir = os.path.abspath(path)
        manifest_path = os.path.join(root_dir, 'mlbox.yaml')
        manifest, load_error = MLObject.create_object_from_file(manifest_path)
        if load_error:
            raise RuntimeError(load_error)

        self.root = root_dir
        self.name = manifest['name']
        self.version = manifest['version']
        self.runtime = DockerRuntime.load(root_dir)
    def test_load_workflow_object(self):
        """Exercise ``load_workflow_object`` against a patched Metastore.

        Covers three metastore answers: no workflow object, a workflow object
        without steps, and a workflow object with a ``steps`` attribute.
        """
        with patch.object(
            mlspeclib.experimental.metastore, "Metastore"
        ) as metastore_double:
            # The metastore returns no workflow object at all.
            metastore_double.get_workflow_object.return_value = (None, None)
            with self.assertRaises(KnownException) as raised:
                load_workflow_object("0.0.1", metastore_double)
            self.assertTrue("load workflow" in str(raised.exception))

            # The metastore returns a workflow object that has no steps.
            without_steps = MLObject()
            metastore_double.get_workflow_object.return_value = (without_steps, None)
            with self.assertRaises(KnownException) as raised:
                load_workflow_object("0.0.1", metastore_double)
            self.assertTrue("field 'steps'" in str(raised.exception))

            # The metastore returns a workflow object with a steps attribute.
            with_steps = MLObject()
            with_steps.steps = {}
            metastore_double.get_workflow_object.return_value = (with_steps, None)
            loaded = load_workflow_object("0.0.1", metastore_double)

            self.assertTrue(isinstance(loaded, MLObject))
Beispiel #10
0
    def __init__(self, path: str):
        """Load the MLBox root metadata; task, invoke and platform start unset.

        Args:
            path (str): Path to a MLBox root directory.

        Raises:
            RuntimeError: If loading 'mlbox.yaml' from the directory reports an error.
        """
        root_dir = os.path.abspath(path)
        manifest_path = os.path.join(root_dir, 'mlbox.yaml')
        manifest, load_error = MLObject.create_object_from_file(manifest_path)
        if load_error:
            raise RuntimeError(load_error)

        self.root = root_dir
        self.name = manifest['name']
        self.version = manifest['version']
        # These three are initialised to None here and not populated in this method.
        self.task: Optional[MLBoxTask] = None
        self.invoke: Optional[MLBoxInvoke] = None
        self.platform: Any = None
Beispiel #11
0
def mlobject_from_dict(schema_type, schema_version, dict_value):
    """Build and validate an MLObject of the given schema from a dictionary.

    Args:
        schema_type: Schema type to set on the new object.
        schema_version: Schema version to set on the new object.
        dict_value: Mapping with the field values. It is not modified.

    Returns:
        ``(ml_object, None)`` when ``validate()`` reports no errors, otherwise
        ``(None, errors)``.
    """
    ml_object = MLObject()
    ml_object.set_type(
        schema_version=schema_version,
        schema_type=schema_type,
    )

    # Work on a shallow copy so the schema keys are not written into the
    # caller's dictionary as a side effect.
    payload = dict(dict_value)
    payload['schema_type'] = schema_type
    payload['schema_version'] = schema_version
    MLObject.update_tree(ml_object, payload)

    errors = ml_object.validate()
    if errors:
        return None, errors
    return ml_object, None
Beispiel #12
0
class StepExecution:
    """Sample step that returns a result object filled with mocked data paths."""

    input_params = {}  # noqa
    execution_params = {}  # noqa
    ml_object = MLObject()  # noqa
    logger = None  # noqa

    def __init__(self, input_params, execution_params):
        """Store the parameters and write them to the debug log.

        Raises:
            KnownException: If ``execution_params`` is None.
        """
        self.input_params = input_params
        self.execution_params = execution_params
        self.logger = setupLogger().get_root_logger()

        # Execute all work in here.

        # Output input params & execution params
        if self.input_params is not None:
            self.logger.debug(f"Input params: {self.input_params}")

        if self.execution_params is None:
            raise KnownException("No execution parameters provided.")
        self.logger.debug(f"Execution params: {self.execution_params}")

    def execute(self, result_object_schema_type, result_object_schema_version):
        """Return a result MLObject of the requested schema with mocked paths."""
        # Create Result object
        outcome = MLObject()
        outcome.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results
        outcome.data_output_path = str(Path("tests/data/data_output.csv"))
        outcome.data_statistics_path = str(Path("tests/data/data_stats.csv"))
        outcome.data_schemas_path = str(Path("tests/data/data_schemas.yaml"))
        outcome.feature_file_path = str(Path("tests/data/feature_file.yaml"))

        # validate() is still called, but its result is deliberately ignored.
        outcome.validate()  # noqa
        return outcome
class StepExecution:
    """Sample step that returns a result object filled with mocked training metrics."""

    input_params = {}  # noqa
    execution_params = {}  # noqa
    ml_object = MLObject()  # noqa
    logger = None  # noqa

    def __init__(self, input_params, execution_params):
        """Store the parameters and write any non-None ones to the debug log."""
        self.input_params = input_params
        self.execution_params = execution_params
        self.logger = logging.getLogger()

        # Execute all work in here.

        # Output input params & execution params
        labelled_params = (
            ("Input params", self.input_params),
            ("Execution params", self.execution_params),
        )
        for label, params in labelled_params:
            if params is not None:
                self.logger.debug(f"{label}: {params}")

    def execute(self, result_object_schema_type, result_object_schema_version):
        """Return a result MLObject of the requested schema with mocked metrics."""
        # Create Result object
        outcome = MLObject()
        outcome.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results (drawn in the same order as they are assigned)
        execution_id = str(uuid.uuid4())
        accuracy = random.randrange(0, 100) / 100
        global_step = 10**random.randrange(2, 4)
        loss = random.randrange(1000, 9999) / 1000

        outcome.training_execution_id = execution_id
        outcome.accuracy = accuracy
        outcome.global_step = global_step
        outcome.loss = loss

        # validate() is still called, but its result is deliberately ignored.
        outcome.validate()  # noqa
        return outcome
Beispiel #14
0
    def execute(self, result_object_schema_type, result_object_schema_version):
        """Return a result MLObject of the requested schema with mocked data paths."""
        # Create Result object
        outcome = MLObject()
        outcome.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results: attribute name -> relative path
        mocked_paths = {
            "data_output_path": "tests/data/data_output.csv",
            "data_statistics_path": "tests/data/data_stats.csv",
            "data_schemas_path": "tests/data/data_schemas.yaml",
            "feature_file_path": "tests/data/feature_file.yaml",
        }
        for field_name, relative_path in mocked_paths.items():
            setattr(outcome, field_name, str(Path(relative_path)))

        # validate() is still called, but its result is deliberately ignored.
        outcome.validate()  # noqa
        return outcome
    def execute(self, result_object_schema_type, result_object_schema_version):
        """Return a result MLObject of the requested schema with mocked training metrics."""
        # Create Result object
        outcome = MLObject()
        outcome.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results (drawn in the same order as they are assigned)
        execution_id = str(uuid.uuid4())
        accuracy = random.randrange(0, 100) / 100
        global_step = 10**random.randrange(2, 4)
        loss = random.randrange(1000, 9999) / 1000

        outcome.training_execution_id = execution_id
        outcome.accuracy = accuracy
        outcome.global_step = global_step
        outcome.loss = loss

        # validate() is still called, but its result is deliberately ignored.
        outcome.validate()  # noqa
        return outcome
class StepExecution:
    """Sample step that returns a result object filled with mocked training metrics."""

    input_params = {}  # noqa
    execution_params = {}  # noqa
    ml_object = MLObject()  # noqa
    rootLogger = None  # noqa

    def __init__(self, input_params, execution_params):
        """Store the parameters and write any non-None ones to the debug log."""
        self.input_params = input_params
        self.execution_params = execution_params
        self.rootLogger = setupLogger().get_root_logger()

        # Execute all work in here.

        # Output input params & execution params
        labelled_params = (
            ("Input params", self.input_params),
            ("Execution params", self.execution_params),
        )
        for label, params in labelled_params:
            if params is not None:
                self.rootLogger.debug(f"{label}: {params}")

    def execute(self, result_object_schema_type, result_object_schema_version):
        """Return a result MLObject of the requested schema with mocked metrics."""
        # Create Result object
        outcome = MLObject()
        outcome.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results (drawn in the same order as they are assigned)
        execution_id = uuid.uuid4()
        accuracy = randrange(93000, 99999) / 100000
        global_step = randrange(50, 150) * 100
        loss = randrange(10000, 99999) / 1000000

        outcome.training_execution_id = execution_id
        outcome.accuracy = accuracy
        outcome.global_step = global_step
        outcome.loss = loss

        return outcome
    def execute(self, result_object_schema_type, result_object_schema_version):
        """Return a result MLObject of the requested schema with mocked training metrics."""
        # Create Result object
        outcome = MLObject()
        outcome.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results (drawn in the same order as they are assigned)
        execution_id = uuid.uuid4()
        accuracy = randrange(93000, 99999) / 100000
        global_step = randrange(50, 150) * 100
        loss = randrange(10000, 99999) / 1000000

        outcome.training_execution_id = execution_id
        outcome.accuracy = accuracy
        outcome.global_step = global_step
        outcome.loss = loss

        return outcome
Beispiel #18
0
def parse_mlcube_task(filename):
    """Load an MLCube task file and index its inputs and outputs by name.

    Each entry of ``task.inputs`` / ``task.outputs`` is converted with
    ``mlobject_from_dict`` and the lists are replaced by ``{name: object}``
    dictionaries.

    Returns:
        ``(task, None)`` on success, or ``(None, err)`` with the first error
        reported while loading the file or converting an entry.
    """
    task, load_err = MLObject.create_object_from_file(filename)
    if load_err:
        return None, load_err

    inputs_by_name = {}
    for raw_input in task.inputs:
        parsed_input, input_err = mlobject_from_dict(
            'mlcube_task_input', '1.0.0', raw_input)
        if input_err:
            return None, input_err
        inputs_by_name[parsed_input.name] = parsed_input

    outputs_by_name = {}
    for raw_output in task.outputs:
        parsed_output, output_err = mlobject_from_dict(
            'mlcube_task_output', '1.0.0', raw_output)
        if output_err:
            return None, output_err
        outputs_by_name[parsed_output.name] = parsed_output

    task.inputs = inputs_by_name
    task.outputs = outputs_by_name
    return task, None
    def execute(self, result_object_schema_type, result_object_schema_version):
        # Create Result object
        results_object = MLObject()
        results_object.set_type(
            schema_type=result_object_schema_type,
            schema_version=result_object_schema_version,
        )

        # Mocked up results
        return_dict = YAML.safe_load(
            """
servable: True
package_size: 1029310298
tested_platforms: ['kubeflow', 'azureml', 'sagemaker']
model_source:
    servable_model:
        data_store: 's3'
        bucket: 'nlp-bucket'
        path: 'a231653454ca8e07f42adc7941aeec6b'
serving_container_image:
    container_image_url: 'https://hub.docker.com/repository/docker/contoso/nlp-base-images'
"""
        )

        results_object.servable = return_dict["servable"]
        results_object.tested_platforms = return_dict["tested_platforms"]
        results_object.package_size = return_dict["package_size"]
        results_object.model_source.servable_model.data_store = return_dict[
            "model_source"
        ]["servable_model"]["data_store"]
        results_object.model_source.servable_model.bucket = return_dict["model_source"][
            "servable_model"
        ]["bucket"]
        results_object.model_source.servable_model.path = return_dict["model_source"][
            "servable_model"
        ]["path"]
        results_object.serving_container_image.container_image_url = return_dict[
            "serving_container_image"
        ]["container_image_url"]

        return results_object
def main():
    """Load a workflow definition and register it and its steps in the metastore.

    The workflow YAML is read from ``.parameters/workflow_2_3_0.yaml``, stamped
    with a workflow version, fresh run/step ids and the current time, turned
    into an MLObject and written to the metastore using
    ``Credentials.metastore_credentials_prod``.

    Raises:
        ValueError: If the workflow document does not load as an MLObject.
    """
    MLSchema.append_schema_to_registry(Path(".parameters") / "schemas")

    # NOTE(review): the file name says 2_3_0 while the version is "2.0.0" -
    # confirm this is intended.
    workflow_version = "2.0.0"
    workflow_path = Path(".parameters") / "workflow_2_3_0.yaml"

    # Parse the document directly; dumping the text and loading it twice
    # yields the same dictionary.
    workflow_dict = YAML.safe_load(workflow_path.read_text())
    workflow_dict["workflow_version"] = workflow_version
    workflow_dict["run_id"] = str(uuid.uuid4())
    workflow_dict["step_id"] = str(uuid.uuid4())
    workflow_dict["run_date"] = datetime.datetime.now()

    workflow_string = YAML.safe_dump(workflow_dict)
    (workflow_object,
     errors) = MLObject.create_object_from_string(workflow_string)

    # Stop here rather than pushing an invalid workflow to the metastore.
    if errors:
        raise ValueError(
            f"Error loading workflow from {str(workflow_path)}: {errors}")

    credentials_packed = Credentials.metastore_credentials_prod
    ms = Metastore(credentials_packed)

    workflow_node_id = ms.create_workflow_node(workflow_object,
                                               workflow_dict["run_id"])
    ms.create_workflow_steps(workflow_node_id, workflow_object)
    print(
        f"Success {str(workflow_path)} - workflow_node_id: {workflow_node_id}")
Beispiel #21
0
from mlspeclib import MLObject, MLSchema
from pathlib import Path
import pprint

# Populate the schema registry, then append the schemas found under ./schemas.
MLSchema.populate_registry()
MLSchema.append_schema_to_registry(Path("schemas"))

(sample_instantiated_object,
 err) = MLObject.create_object_from_file('sample_instantiated_schema.yaml')

pp = pprint.PrettyPrinter(indent=4)
# Truthiness treats both None and an empty container as "no errors";
# comparing against {} would report None as an error.
if err:
    pp.pprint(err)
else:
    pp.pprint(sample_instantiated_object.tasks)

(sample_task_object,
 err) = MLObject.create_object_from_file('tasks/download_data.yaml')

if err:
    pp.pprint(err)
else:
    pp.pprint(sample_task_object.inputs)
    pp.pprint(sample_task_object.outputs)

# Load every task definition in ./tasks and report any that fail to load.
load_path = Path('tasks').glob("*.yaml")
load_list = list(load_path)

for this_file in load_list:
    file_text = this_file.read_text()
    (loaded_object, err) = MLObject.create_object_from_string(file_text)
    if err:
        pp.pprint(err)
Beispiel #22
0
def parse_mlbox_docker(filename):
    """Load ``filename`` as an MLObject and return ``(object, error)``."""
    docker_config, load_error = MLObject.create_object_from_file(filename)
    return docker_config, load_error
Beispiel #23
0
def load_contract_object(
    parameters: dict, workflow_object: MLObject, step_name: str, contract_type: str
):
    """Create an MLObject from ``parameters`` and check it against the workflow.

    ``parameters`` may be a dict (dumped to YAML first) or a YAML string. The
    loaded object's schema type and version are compared with the ones the
    workflow declares for ``steps[step_name][contract_type]``.

    Returns:
        The loaded contract object.

    Raises:
        KnownException: If ``contract_type`` is not in ``CONTRACT_TYPES``, if
            ``parameters`` is neither a dict nor a string, if loading reports
            errors, or if the workflow lacks the step or the contract type.
            A schema mismatch is handed to ``raise_schema_mismatch``.
    """
    log = setupLogger().get_root_logger()

    if contract_type not in CONTRACT_TYPES:
        raise KnownException(
            f"{contract_type} not in the expected list of contract types: {CONTRACT_TYPES}."
        )

    if not isinstance(parameters, (dict, str)):
        raise KnownException(
            f"load_contract_object was called with neither a string nor a dict. Value: {parameters}"
        )
    serialized = (
        YAML.safe_dump(parameters) if isinstance(parameters, dict) else parameters
    )

    contract_object, errors = MLObject.create_object_from_string(serialized)

    has_errors = errors is not None and len(errors) > 0
    if has_errors:
        log.debug(f"{contract_type} object loading errors: {errors}")
        raise KnownException(
            f"Error when trying to validate the contract object {step_name}.{contract_type}. Errors: {errors}"
        )

    workflow_steps = workflow_object["steps"]
    if step_name not in workflow_steps:
        raise KnownException(
            f"Workflow object does not contain the step '{step_name}'."
        )

    step_spec = workflow_steps[step_name]
    if contract_type not in step_spec:
        raise KnownException(
            f"Workflow object for step '{step_name}' does not contain a spec for the contract type: '{contract_type}'."
        )

    # What the workflow says this contract should look like.
    expected = step_spec[contract_type]
    type_differs = contract_object.schema_type != expected.schema_type
    version_differs = contract_object.schema_version != expected.schema_version
    if type_differs or version_differs:
        raise_schema_mismatch(
            expected_type=expected.schema_type,
            actual_type=contract_object.schema_type,
            expected_version=expected.schema_version,
            actual_version=contract_object.schema_version,
        )

    log.debug(
        f"Successfully loaded and validated contract object: {contract_object.schema_type} on step {step_name}.{contract_type}"
    )
    return contract_object
Beispiel #24
0
def parse_mlbox_root(filename):
    """Load ``filename`` as an MLObject and return ``(object, error)``."""
    root_config, load_error = MLObject.create_object_from_file(filename)
    return root_config, load_error
Beispiel #25
0
def parse_mlbox_invoke(filename):
    """Load an invocation file as an MLObject.

    Returns:
        ``(object, error)`` from ``MLObject.create_object_from_file``, or
        ``(None, message)`` when ``filename`` does not exist.
    """
    if os.path.exists(filename):
        invoke_config, load_error = MLObject.create_object_from_file(filename)
        return invoke_config, load_error
    return None, f'No such invocation file: {filename}'
Beispiel #26
0
def execute_step(
    execution_file: str,
    workflow_object: MLObject,
    input_object: MLObject,
    execution_object: MLObject,
    step_name,
    run_id,
):
    """Run a workflow step and return its validated result object.

    When ``execution_file`` is None the step is run through ``StepExecution``;
    otherwise the given Python file is executed and is expected to leave its
    result in a variable named ``results_ml_object``. The result is stamped
    with ``run_id``, a new step id and the current time, then checked with
    ``load_contract_object`` against the step's "output" contract.

    Args:
        execution_file: Path of the Python file to execute, or None.
        workflow_object: Workflow definition; ``steps[step_name].output``
            supplies the expected result schema type and version.
        input_object: Input parameters passed to ``StepExecution``.
        execution_object: Execution parameters passed to ``StepExecution``.
        step_name: Name of the step within the workflow.
        run_id: Run identifier written onto the result.

    Returns:
        The result MLObject.

    Raises:
        KnownException: If ``execution_file`` does not exist, or if the result
            is missing, empty or not an MLObject.
    """

    rootLogger = setupLogger().get_root_logger()

    results_ml_object = MLObject()

    if execution_file is None:
        msg = "Did not find any value for INPUT_EXECUTION_FILE, using /src/step_execution.py"

        print_left_message(msg)
        rootLogger.debug("::debug::" + msg)

        print("{:>15}".format("ok"))  # Finished loading from environment

        step_execution_object = StepExecution(input_object, execution_object)
        results_ml_object = step_execution_object.execute(
            result_object_schema_type=workflow_object.steps[
                step_name
            ].output.schema_type,
            result_object_schema_version=workflow_object.steps[
                step_name
            ].output.schema_version,
        )

    else:
        msg = f"Executing '{execution_file}' (found in INPUT_EXECUTION_FILE env var)"

        print_left_message(msg)
        rootLogger.debug("::debug::" + msg)

        execution_file_path = Path(execution_file)

        if execution_file_path.exists() is False:
            raise KnownException(
                f"'{execution_file}' was provided as the file, but it does not appear to exist at {str(execution_file_path.resolve())} -- exiting."
            )

        # The below are used in the execution file
        result_ml_object_schema_type = workflow_object.steps[  # noqa
            step_name
        ].output.schema_type
        result_ml_object_schema_version = workflow_object.steps[  # noqa
            step_name
        ].output.schema_version

        # exec() cannot rebind this function's local variables through
        # locals(), so an assignment to results_ml_object inside the file
        # would be lost. Run the file against an explicit copy of the local
        # namespace and read the result back from it.
        # NOTE(security): this runs arbitrary code from execution_file; only
        # point it at trusted files.
        execution_namespace = dict(locals())
        exec(execution_file_path.read_text(), globals(), execution_namespace)
        results_ml_object = execution_namespace.get("results_ml_object")

        print("{:>15}".format("ok"))  # Finished executing step

    if (results_ml_object is None) or (len(results_ml_object) == 0):
        raise KnownException(
            "No value was assigned to the variable 'results_ml_object' -- exiting."
        )
    elif isinstance(results_ml_object, MLObject) is False:
        raise KnownException(
            "The variable 'results_ml_object' was not of type MLObject -- exiting."
        )

    results_ml_object.run_id = run_id
    results_ml_object.step_id = str(uuid.uuid4())
    results_ml_object.run_date = datetime.datetime.now().isoformat()

    # Using the below to validate the object, even though we already have it created.
    load_contract_object(
        parameters=results_ml_object.dict_without_internal_variables(),
        workflow_object=workflow_object,
        step_name=step_name,
        contract_type="output",
    )

    return results_ml_object
Beispiel #27
0
def sub_main():
    """Run a single workflow step end to end, configured through environment variables.

    In order, this:
      1. Reads the environment into ``parameters``, optionally clones a schemas git
         repository into INPUT_SCHEMAS_DIRECTORY, and appends that directory to the
         MLSchema registry.
      2. Decodes INPUT_METASTORE_CREDENTIALS (a YAML scalar holding urlsafe-base64
         encoded YAML), passes the expected keys to ``report_found_params`` and
         opens the metastore connection.
      3. Loads the workflow object and the step's input/execution parameters,
         validates them with ``load_contract_object`` and attaches each to the
         workflow node.
      4. Executes the step, attaches its output and a log object, prints/logs the
         outputs and writes the combined output message to a file.

    Raises:
        KnownException: If INPUT_METASTORE_CREDENTIALS or INPUT_WORKFLOW_NODE_ID is
            missing or empty, or if cloning the schemas repository fails.
    """
    rootLogger = setupLogger().get_root_logger()

    # Loading input values
    msg = "::debug::Loading input values"
    print_left_message("Loading variables from environment...")
    rootLogger.debug(msg)

    parameters = convert_environment_variables_to_dict()

    print("{:>15}".format("ok"))  # Finished loading from environment

    parameters.INPUT_SCHEMAS_DIRECTORY = os.environ.get("INPUT_SCHEMAS_DIRECTORY")

    # Clone only when a non-empty URL was actually supplied; an unset or empty
    # variable means "use the schemas directory as it is".
    schemas_git_url = os.environ.get("INPUT_SCHEMAS_GIT_URL")
    if schemas_git_url:
        parameters.INPUT_SCHEMAS_GIT_URL = schemas_git_url
        print_left_message(
            f"Downloading schemas from {parameters.INPUT_SCHEMAS_GIT_URL}..."
        )
        try:
            # Shallow clone into a uniquely named sub-directory of the schemas dir.
            git.Git(parameters.INPUT_SCHEMAS_DIRECTORY).clone(
                parameters.INPUT_SCHEMAS_GIT_URL, str(uuid.uuid4()), depth=1
            )
            # TODO: Authenticate with GH Token?
            print("{:>15}".format("ok"))  # Finished loading from GIT URL
        except GitCommandError as gce:
            # Chain the original error so the git traceback is not lost.
            raise KnownException(
                f"Trying to read from the git repo ({parameters.INPUT_SCHEMAS_GIT_URL}) and write to the directory ({parameters.INPUT_SCHEMAS_DIRECTORY}). Full error follows: {str(gce)}"
            ) from gce

    print_left_message("Appending schemas to registry...")
    MLSchema.append_schema_to_registry(Path(parameters.INPUT_SCHEMAS_DIRECTORY))
    print("{:>15}".format("ok"))  # Finished loading registry

    parameters.previous_step_name = os.environ.get("INPUT_PREVIOUS_STEP_NAME", "")
    parameters.next_step_name = os.environ.get("INPUT_NEXT_STEP_NAME", "")
    rootLogger.debug("::debug:: Finished main")

    # Load metastore credentials

    rootLogger.debug("::debug:: Loading credentials")
    print_left_message("Loading and validating metastore credentials...")
    metastore_cred_string_blob = os.environ.get("INPUT_METASTORE_CREDENTIALS")
    if not metastore_cred_string_blob:
        # Fail with a readable message instead of an opaque parser/decoder error.
        raise KnownException(
            "INPUT_METASTORE_CREDENTIALS - No metastore credentials were provided."
        )

    # The variable is YAML whose scalar value is urlsafe-base64 encoded YAML.
    metastore_credentials_packed = YAML.safe_load(metastore_cred_string_blob)
    metastore_credentials_string = base64.urlsafe_b64decode(
        metastore_credentials_packed
    ).decode("utf-8")
    metastore_credentials = YAML.safe_load(metastore_credentials_string)

    report_found_params(
        ["url", "key", "database_name", "container_name"], metastore_credentials
    )
    print("{:>15}".format("ok"))  # Finished loading and validating metastore
    rootLogger.debug("::debug::Starting metastore connection")

    print_left_message("Starting connection to metastore...")
    # The connection helper receives the still-packed (base64) credentials.
    ms = load_metastore_connection(metastore_credentials_packed)
    print("{:>15}".format("ok"))  # Finished connecting to metastore

    workflow_node_id = os.environ.get("INPUT_WORKFLOW_NODE_ID")
    if not workflow_node_id:  # unset (None) and empty are both invalid
        raise KnownException(
            "INPUT_WORKFLOW_NODE_ID - No workflow node id was provided."
        )

    print_left_message(f"Loading workflow object ID: '{workflow_node_id}' ...")
    workflow_object = load_workflow_object(workflow_node_id, ms)
    print("{:>15}".format("ok"))  # Finished loading workflow object

    rootLogger.debug("::debug::Loading input parameters")
    print_left_message("Loading input parameters ...")
    input_parameters = load_parameters("INPUT", ms)
    print("{:>15}".format("ok"))  # Finished loading input parameters from metastore

    rootLogger.debug("::debug::Loading execution parameters file")
    print_left_message("Loading execution parameters ...")
    execution_parameters = load_parameters("EXECUTION", ms)
    print(
        "{:>15}".format("ok")
    )  # Finished loading execution parameters from metastore

    step_name = parameters.INPUT_STEP_NAME
    print_left_message(f"Loading contract for '{step_name}.input' ...")
    input_object = load_contract_object(
        parameters=input_parameters,
        workflow_object=workflow_object,
        step_name=step_name,
        contract_type="input",
    )
    print("{:>15}".format("ok"))  # Finished loading the input contract

    print(f"Attaching step info to input for '{step_name}.input' ... ")
    input_node_id = ms.attach_step_info(
        input_object,
        workflow_object.schema_version,
        workflow_node_id,
        step_name,
        "input",
    )
    print(f"     Input Node ID: {input_node_id}")  # Finished attaching step ID to input

    rootLogger.debug(f"Successfully saved: {input_object}")

    # TODO don't hard code any of these
    # NOTE: exec_dict aliases execution_parameters, so the run metadata is added to
    # that same mapping in place.
    exec_dict = execution_parameters
    exec_dict["run_id"] = parameters.GITHUB_RUN_ID
    exec_dict["run_date"] = datetime.datetime.now()
    exec_dict["step_id"] = str(uuid.uuid4())

    print_left_message(f"Loading contract for '{step_name}.execution' ...")
    execution_object = load_contract_object(
        parameters=exec_dict,
        workflow_object=workflow_object,
        step_name=step_name,
        contract_type="execution",
    )
    print("{:>15}".format("ok"))  # Finished loading the execution contract

    rootLogger.debug(f"Successfully loaded and validated execution: {execution_object}")

    print(f"Attaching step info to input for '{step_name}.execution' ... ")
    execution_node_id = ms.attach_step_info(
        execution_object,
        workflow_object.schema_version,
        workflow_node_id,
        step_name,
        "execution",
    )
    rootLogger.debug(f"Successfully saved: {execution_object}")
    print(
        f"      Execution Node ID: {execution_node_id}"
    )  # Finished attaching step ID to execution

    # Branching between use step_execution.py or execution file.
    execution_file = os.environ.get("INPUT_EXECUTION_FILE")

    print_left_message("Executing step ... ")
    print("{:>15}".format("ok"))  # Starting executing step
    results_ml_object = execute_step(
        execution_file,
        workflow_object,
        input_object,
        execution_object,
        step_name,
        parameters.GITHUB_RUN_ID,
    )
    print_left_message("Finished executing step ... ")
    print("{:>15}".format("ok"))  # Finished executing step

    # TODO: Need to add next and previous steps to attach_step_info
    print(f"Attaching step info to output for '{step_name}.output' ... ")
    output_node_id = ms.attach_step_info(
        results_ml_object,
        workflow_object.schema_version,
        workflow_node_id,
        step_name,
        "output",
    )
    print(
        f"      Output Node ID: {output_node_id}"
    )  # Finished attaching step ID to output

    # Serialise the results as YAML, then urlsafe-base64 so they fit in one output.
    dict_conversion = results_ml_object.dict_without_internal_variables()

    string_io_handle = StringIO()
    YAML.SafeDumper.add_representer(uuid.UUID, repr_uuid)
    YAML.safe_dump(dict_conversion, string_io_handle)
    yaml_conversion = string_io_handle.getvalue()

    encode_to_utf8_bytes = yaml_conversion.encode("utf-8")
    base64_encode = base64.urlsafe_b64encode(encode_to_utf8_bytes)
    final_encode_to_utf8 = str(base64_encode, "utf-8")

    # Recording raw log info
    # logBuffer.flush()
    # log_contents = logBuffer.getvalue()

    log_object = MLObject()
    log_object.set_type(schema_version="0.1.0", schema_type="log")
    log_object.run_id = parameters.GITHUB_RUN_ID
    log_object.step_name = step_name
    log_object.run_date = datetime.datetime.now()
    log_object.raw_log = (
        "NO RAW LOGS YET (NEED TO FIGURE OUT WHERE I CAN PUSH A LARGE OBJECT)"
    )
    # log_object.raw_log = log_contents
    log_object.log_property_bag = {}

    # errors = log_object.validate()

    log_node_id = ms.attach_step_info(
        log_object, workflow_object.schema_version, workflow_node_id, step_name, "log"
    )

    rootLogger.debug(
        f"::set-output name=output_raw::{results_ml_object.dict_without_internal_variables()}"
    )

    print("Printing output ... \n \n")
    logger = setupLogger()
    # Each entry is emitted through print_and_log, in this order, and whatever it
    # returns is collected (one per line) into the output message.
    outputs = (
        ("output_raw", results_ml_object.dict_without_internal_variables()),
        ("output_base64_encoded", final_encode_to_utf8),
        ("input_node_id", input_node_id),
        ("execution_node_id", execution_node_id),
        ("output_node_id", output_node_id),
        ("log_node_id", log_node_id),
    )
    output_message = "".join(
        f"{logger.print_and_log(name, value)}\n" for name, value in outputs
    )

    rootLogger.debug(f"Complete output: \n {output_message}")
    print("\n\n... finished printing output")  # Finished printing output

    print_left_message("Generating /output_message.txt ...")
    if is_docker():
        Path("/output_message.txt").write_text(output_message)
    else:
        # Outside Docker the message only goes to an anonymous temp file; the
        # context manager closes the handle instead of leaking it.
        with tempfile.TemporaryFile() as fp:
            fp.write(output_message.encode("utf-8"))
    print("{:>15}".format("ok"))  # Finished writing the output message
Beispiel #28
0
from mlspeclib import MLSchema, MLObject
from random import randint, random

# Object this script is expected to populate with the step's results.
# NOTE(review): `result_ml_object_schema_type` and `result_ml_object_schema_version`
# are not defined in this file; presumably the caller that exec()s this script
# supplies them in its namespace -- confirm against the runner.
results_ml_object = MLObject()

results_ml_object.set_type(
    schema_type=result_ml_object_schema_type,
    schema_version=result_ml_object_schema_version,
)

# Execute code below. Examples:
"""
1) Execute on the worker node this step is running on

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

dataset = pd.read_csv('Weather.csv') # Example dataset

X = dataset['MinTemp'].values.reshape(-1,1)
y = dataset['MaxTemp'].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train) # Training the algorithm

y_pred = regressor.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    def test_e2e(self):
        """Run the step container end to end against a metastore.

        Builds the container parameters from an optional YAML variables file
        overlaid with every environment variable whose name contains "INPUT",
        registers a workflow node and its steps in the metastore, pulls and runs
        the Docker image with those parameters passed as ``-e`` variables, and
        asserts that the run wrote nothing to stderr and that the expected number
        of vertices carry the workflow node id. Vertices created for the workflow
        node are dropped afterwards, whether or not the assertions passed.
        """
        # Read the environment before consulting it: looking the overrides up in a
        # still-empty dict would always yield the defaults.
        parameters_from_environment = {
            var: value for var, value in os.environ.items() if "INPUT" in var
        }

        integration_tests_dir = parameters_from_environment.get(
            "INPUT_INTEGRATION_TESTS_DIRECTORY", "integration")
        # parameters_dir_name = parameters_from_environment.get("INPUT_PARAMETERS_DIRECTORY", "/src/.parameters")
        variables_file_name = parameters_from_environment.get(
            "INPUT_INTEGRATION_TESTS_VARIABLE_FILE_NAME",
            "integration_test_variables.yaml",
        )

        # NOTE(review): this dumps the whole environment, which includes
        # INPUT_METASTORE_CREDENTIALS when it is set -- consider removing.
        print(os.environ)

        parameters_from_file = {}
        parameters_file_location = Path(
            integration_tests_dir) / variables_file_name
        if parameters_file_location.exists():
            parameters_from_file = YAML.safe_load(
                parameters_file_location.read_text("utf-8"))

        # Building everything into parameters that we'll eventually write to environment variables to execute Docker
        # (environment values win over values from the file).
        parameters = {**parameters_from_file, **parameters_from_environment}
        schemas_dir_name = parameters.get(
            "INPUT_INTEGRATION_TESTS_SCHEMAS_DIR_NAME",
            "/src/parameters/schemas")

        repo_name = parameters.get("INPUT_CONTAINER_REPO_NAME")
        container_name = parameters.get("INPUT_CONTAINER_NAME")

        # Default to a throwaway, very high version so test runs do not collide.
        workflow_version = parameters.setdefault(
            "INPUT_WORKFLOW_VERSION",
            f"999999999999.9.{random.randint(0, 9999)}")

        MLSchema.append_schema_to_registry(Path(schemas_dir_name))

        # INPUT_WORKFLOW may arrive as a dict (from the YAML file) or as a YAML
        # string (from the environment); normalise to a dict either way.
        workflow_input = parameters.get("INPUT_WORKFLOW")
        if isinstance(workflow_input, dict):
            workflow_string = YAML.safe_dump(workflow_input)
        else:
            workflow_string = workflow_input

        workflow_dict = YAML.safe_load(workflow_string)
        workflow_dict["workflow_version"] = workflow_version
        workflow_dict["run_id"] = str(uuid.uuid4())
        parameters["GITHUB_RUN_ID"] = workflow_dict["run_id"]
        parameters["GITHUB_WORKSPACE"] = "/src"

        workflow_dict["step_id"] = str(uuid.uuid4())
        workflow_dict["run_date"] = datetime.datetime.now()

        workflow_string = YAML.safe_dump(workflow_dict)
        (workflow_object,
         errors) = MLObject.create_object_from_string(workflow_string)
        # Stop here with a clear message rather than failing later in the
        # metastore calls with an unusable workflow object.
        self.assertFalse(errors,
                         f"Workflow object failed to load: {errors}")

        credentials_packed = parameters_from_environment.get(
            "INPUT_METASTORE_CREDENTIALS", None)

        if credentials_packed is None:
            credentials_packed = (Path(integration_tests_dir) /
                                  "metastore_credentials.yaml").read_text(
                                      encoding="utf-8")

        # TODO Sometimes secrets have no spacer. Should figure this out.
        parameters["INPUT_METASTORE_CREDENTIALS"] = credentials_packed

        ms = Metastore(credentials_packed)
        debug_args = ""
        environment_vars_list = []

        workflow_node_id = None
        try:
            workflow_node_id = ms.create_workflow_node(workflow_object,
                                                       workflow_dict["run_id"])
            ms.create_workflow_steps(workflow_node_id, workflow_object)
            parameters["INPUT_WORKFLOW_NODE_ID"] = workflow_node_id

            # Every parameter becomes a `-e NAME=value` pair for `docker run`;
            # dict values are serialised back to YAML first.
            for param, value in parameters.items():
                env_value = YAML.safe_dump(value) if isinstance(
                    value, dict) else value
                debug_args += f" -e '{param}={env_value}'"
                environment_vars_list.append("-e")
                environment_vars_list.append(f"{param}={env_value}")

            exec_statement = [
                "docker",
                "pull",
                f"{repo_name}/{container_name}",
            ]

            # Print the command that is actually executed (`docker pull` has no
            # `--no-cache` flag).
            print(f"docker pull {repo_name}/{container_name}")
            # self.rootLogger.debug(f"exec_statement = {exec_statement}")

            p = subprocess.Popen(exec_statement,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
            # self.rootLogger.debug(f"out = {str(out)}")
            # self.rootLogger.debug(f"error = {str(err)}")
            # self.assertTrue(str(err, "utf-8") == "")

            exec_statement = (["docker", "run"] + environment_vars_list +
                              [f"{repo_name}/{container_name}"])

            # Interactive variant of the command, printed for manual debugging.
            # print(f"args statement: '{debug_args}'")
            print(
                f"docker run \\\n {debug_args} \\\n -ti --entrypoint=/bin/bash {repo_name}/{container_name}"
            )
            # self.rootLogger.debug(f"exec_statement = {exec_statement}")

            p = subprocess.Popen(exec_statement,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
            self.rootLogger.debug(f"out = {str(out)}")
            self.rootLogger.debug(f"error = {str(err)}")
            # The container must not have written anything to stderr.
            self.assertEqual(str(err, "utf-8"), "")
            result = ms.execute_query(
                f"g.V().has('workflow_node_id', '{workflow_node_id}').count()")
            # NOTE(review): 8 is presumably the number of vertices one complete
            # step run attaches to the workflow node -- confirm.
            self.assertEqual(result[0], 8)

        finally:
            # Best-effort cleanup: a failure here must not mask the test result,
            # but it is logged rather than silently dropped.
            if workflow_node_id is not None:
                try:
                    ms.execute_query(
                        f"g.V().has('workflow_node_id', '{workflow_node_id}').drop()"
                    )
                except ValueError as cleanup_error:
                    self.rootLogger.debug(
                        f"Cleanup of workflow node '{workflow_node_id}' failed: {cleanup_error}"
                    )
    def test_process_data(self):
        """
        Full E2E of Process Data

        First checks that a results object with only its schema type/version set
        is rejected by ``verify_result_contract``. Then runs the sample execution
        script, adds run metadata and random execution-profile values to the
        object the script produced, and checks that it passes the contract.
        """
        # THESE SHOULD BE THE ONLY SETTINGS FOR THIS FILE
        step_name = "process_data"
        expected_results_schema_type = "data_result"  # MUST BE A LOADED SCHEMA
        expected_results_schema_version = "9999.0.1"  # MUST BE A SEMVER

        results_ml_object = MLObject()
        results_ml_object.set_type(
            schema_type=expected_results_schema_type,
            schema_version=expected_results_schema_version,
        )

        # Should error due to missing fields
        with self.assertRaises(ValueError) as context:
            verify_result_contract(
                results_ml_object,
                expected_results_schema_type,
                expected_results_schema_version,
                step_name,
            )

        self.assertIn(
            f"Error verifying result object for '{step_name}.output'",
            str(context.exception))

        # exec() cannot rebind this function's local variables, so the script is
        # run against an explicit namespace and its result is read back from it.
        # The namespace starts as a copy of the locals so the script still sees
        # every name it could see before, plus the names it is documented to use.
        exec_namespace = dict(locals())
        exec_namespace.update(
            results_ml_object=MLObject(),
            result_ml_object_schema_type=expected_results_schema_type,
            result_ml_object_schema_version=expected_results_schema_version,
        )

        exec(
            (Path("tests") / "sample_process_data_execution.py").read_text(),
            globals(),
            exec_namespace,
        )
        results_ml_object = exec_namespace["results_ml_object"]

        results_ml_object.run_date = datetime.datetime.now()
        results_ml_object.step_id = str(uuid.uuid4())
        results_ml_object.run_id = str(uuid.uuid4())

        # Random but plausible execution-profile values.
        profile = results_ml_object.execution_profile
        profile.system_memory_utilization = random()
        # randint() requires real ints; float bounds such as 7e9 raise TypeError
        # on Python 3.12+.
        profile.network_traffic_in_bytes = randint(7 * 10**9, 9 * 10**10)
        profile.gpu_temperature = randint(70, 130)
        profile.disk_io_utilization = random()
        profile.gpu_percent_of_time_accessing_memory = random()
        profile.cpu_utilization = random()
        profile.gpu_utilization = random()
        profile.gpu_memory_allocation = random()

        self.assertTrue(
            verify_result_contract(
                results_ml_object,
                expected_results_schema_type,
                expected_results_schema_version,
                step_name,
            ))