def test_log_metadata_successfully_with_minimum_information(self):
    """Minimal artifacts (name + uri only) get ids; re-logging keeps the id."""
    store = metadata.Store(grpc_host=GRPC_HOST, grpc_port=GRPC_PORT)
    workspace = metadata.Workspace(store=store, name="ws_1")
    ws_run = metadata.Run(workspace=workspace, name="first run")
    execution = metadata.Execution(
        name="test execution", workspace=workspace, run=ws_run)
    self.assertIsNotNone(execution.id)

    data_set = execution.log_input(
        metadata.DataSet(name="mytable-dump", uri="file://path/to/dataset"))
    self.assertIsNotNone(data_set.id)
    first_data_set_id = data_set.id
    # Logging the same artifact a second time must not mint a new id.
    execution.log_input(data_set)
    self.assertEqual(first_data_set_id, data_set.id)

    metrics = execution.log_output(
        metadata.Metrics(name="MNIST-evaluation",
                         uri="gcs://my-bucket/mnist-eval.csv"))
    self.assertIsNotNone(metrics.id)
    first_metrics_id = metrics.id
    # Same idempotence check for an output artifact.
    execution.log_output(metrics)
    self.assertEqual(first_metrics_id, metrics.id)

    model = execution.log_output(
        metadata.Model(name="MNIST", uri="gcs://my-bucket/mnist"))
    self.assertIsNotNone(model.id)
    first_model_id = model.id
    execution.log_output(model)
    self.assertEqual(first_model_id, model.id)
def create_execution():
    """Prepare an execution that artifacts can be logged against.

    :return: a freshly created ``metadata.Execution``.
    """
    store = metadata.Store(
        grpc_host=ModelUtil.METADATA_STORE_HOST,
        grpc_port=ModelUtil.METADATA_STORE_PORT)
    # Connect to metadata service in namespace kubeflow in k8s cluster.
    ws = metadata.Workspace(
        store=store,
        name="workspace_1",
        description="a workspace for testing",
        labels={"n1": "v1"})
    ws_run = metadata.Run(
        workspace=ws,
        name="run-" + datetime.utcnow().isoformat("T"),
        description="a run in ws_1",
    )
    new_execution = metadata.Execution(
        name="execution" + datetime.utcnow().isoformat("T"),
        workspace=ws,
        run=ws_run,
        description="execution example",
    )
    print("An execution was created with id %s" % new_execution.id)
    return new_execution
def run(self):
    """Run the parent step, then record the dataset artifact as an output.

    Creates an 'assign-dataset-metadata' execution in the step's metadata
    workspace and logs the dataset (name/uri taken from ``self.args``).
    """
    super().run()
    # Renamed from `exec`, which shadows the `exec` builtin.
    execution = metadata.Execution('assign-dataset-metadata',
                                   workspace=self.metadata_workspace)
    dataset = metadata.DataSet(name=self.args.dataset_name,
                               uri=self.args.dataset_uri,
                               version='0')
    execution.log_output(dataset)
def save_checkpoint_metadata(self):
    """Record the checkpoint root as a Model artifact in the metadata store."""
    # Renamed from `exec`, which shadows the `exec` builtin.
    execution = metadata.Execution(
        'train',
        workspace=self.metadata_workspace
    )
    model_md = metadata.Model(
        name=self.model_name,
        uri=self.checkpoint_root,
        version='0'
    )
    execution.log_output(model_md)
def log_model_info(ws, ws_run, model_uri):
    """Record the t2t model artifact under a fresh 'train action' execution.

    Args:
      ws: metadata.Workspace to log into.
      ws_run: metadata.Run the execution belongs to.
      model_uri: storage location of the model.
    """
    train_exec = metadata.Execution(
        name="execution" + datetime.utcnow().isoformat("T"),
        workspace=ws,
        run=ws_run,
        description="train action",
    )
    # NOTE(review): the model is logged as an *input* of the "train action"
    # execution — confirm this is intended (training output is more usual).
    _ = train_exec.log_input(
        metadata.Model(description="t2t model",
                       name="t2t-model",
                       owner="*****@*****.**",
                       uri=model_uri,
                       version="v1.0.0"))
def log_dataset_info(ws, ws_run, data_uri):
    """Record the gh summarization dataset under a fresh 'copy action' execution.

    Args:
      ws: metadata.Workspace to log into.
      ws_run: metadata.Run the execution belongs to.
      data_uri: storage location of the dataset.
    """
    copy_exec = metadata.Execution(
        name="execution" + datetime.utcnow().isoformat("T"),
        workspace=ws,
        run=ws_run,
        description="copy action",
    )
    _ = copy_exec.log_input(
        metadata.DataSet(description="gh summarization data",
                         name="gh-summ-data",
                         owner="*****@*****.**",
                         uri=data_uri,
                         version="v1.0.0"))
def test_artifact_deduplication(self):
    """Identical models logged by different executions share one artifact id."""
    store = metadata.Store(grpc_host=GRPC_HOST, grpc_port=GRPC_PORT)
    ws1 = metadata.Workspace(store=store, name="workspace_one")
    ws2 = metadata.Workspace(store=store, name="workspace_two")
    first_run = metadata.Run(workspace=ws1, name="first run")
    exec_a = metadata.Execution(name="test execution",
                                workspace=ws1,
                                run=first_run)
    exec_b = metadata.Execution(name="execution 2", workspace=ws1)
    # Registered in a second workspace; not asserted on below.
    exec_c = metadata.Execution(name="execution 3", workspace=ws2)
    self.assertIsNotNone(exec_a.id)
    self.assertIsNotNone(exec_b.id)

    # Same name/uri/type/version => the store should deduplicate to a
    # single artifact id.
    model_a = metadata.Model(name="MNIST",
                             uri="gcs://my-bucket/mnist",
                             model_type="neural network",
                             version="v0.0.1")
    model_b = metadata.Model(name="MNIST",
                             uri="gcs://my-bucket/mnist",
                             model_type="neural network",
                             version="v0.0.1")
    exec_a.log_output(model_a)
    self.assertIsNotNone(model_a.id)
    exec_b.log_output(model_b)
    self.assertIsNotNone(model_b.id)
    self.assertEqual(model_a.id, model_b.id)
def log_dataset_info(ws, ws_run, description, name, owner, data_uri, version,
                     query, labels):
    """Log a DataSet artifact as an input of a newly created execution.

    Args:
      ws: metadata.Workspace to log into.
      ws_run: metadata.Run the execution belongs to.
      description, name, owner, data_uri, version, query, labels:
        forwarded to ``metadata.DataSet``.
    """
    # Renamed from `exec` (shadowed the builtin); the unused `dataset_log`
    # binding was dropped.
    execution = metadata.Execution(
        name="Execution" + datetime.utcnow().isoformat("T"),
        workspace=ws,
        run=ws_run,
        description="Dataset log exec.",
    )
    execution.log_input(
        metadata.DataSet(description=description,
                         name=name,
                         owner=owner,
                         uri=data_uri,
                         version=version,
                         query=query,
                         labels=labels))
def log_metric_info(ws, ws_run, description, name, owner, metric_uri,
                    data_set_id, model_id, metrics_type, values, labels):
    """Log a Metrics artifact as an input of a newly created execution.

    Args:
      ws: metadata.Workspace to log into.
      ws_run: metadata.Run the execution belongs to.
      description, name, owner, metric_uri, data_set_id, model_id,
        metrics_type, values, labels: forwarded to ``metadata.Metrics``.
    """
    # Renamed from `exec` (shadowed the builtin); the unused `metric_log`
    # binding was dropped.
    # NOTE(review): metrics are logged via log_input here — confirm they
    # shouldn't be outputs of the execution instead.
    execution = metadata.Execution(
        name="Execution" + datetime.utcnow().isoformat("T"),
        workspace=ws,
        run=ws_run,
        description="Metric log exec.",
    )
    execution.log_input(
        metadata.Metrics(description=description,
                         name=name,
                         owner=owner,
                         uri=metric_uri,
                         data_set_id=data_set_id,
                         model_id=model_id,
                         metrics_type=metrics_type,
                         values=values,
                         labels=labels))
def create_metadata_execution():
    """Create a Workspace/Run/Execution chain for logging MNIST training.

    :return: the newly created ``metadata.Execution``.
    """
    # The original `global metadata` declaration was removed: the module is
    # only read here, never assigned, so the declaration had no effect.
    # Connect to metadata service in namespace kubeflow in k8s cluster.
    mnist_train_workspace = metadata.Workspace(
        store=metadata.Store(grpc_host=METADATA_STORE_HOST,
                             grpc_port=METADATA_STORE_PORT),
        name="mnist train workspace",
        description="a workspace for training mnist",
        labels={"n1": "v1"})
    run1 = metadata.Run(workspace=mnist_train_workspace,
                        name="run-" + datetime.utcnow().isoformat("T"),
                        description="a run in ws_1")
    # Renamed from `exec`, which shadows the `exec` builtin.
    execution = metadata.Execution(
        name="execution" + datetime.utcnow().isoformat("T"),
        workspace=mnist_train_workspace,
        run=run1,
        description="execution example")
    print("An execution was created with id %s" % execution.id)
    return execution
def test_log_metadata_successfully_with_minimum_information(self):
    """Logging artifacts with only name and uri still assigns ids (REST backend)."""
    workspace = metadata.Workspace(backend_url_prefix="127.0.0.1:8080",
                                   name="ws_1")
    workspace_run = metadata.Run(workspace=workspace, name="first run")
    execution = metadata.Execution(name="test execution",
                                   workspace=workspace,
                                   run=workspace_run)
    self.assertIsNotNone(execution.id)

    logged_data = execution.log_input(
        metadata.DataSet(name="mytable-dump", uri="file://path/to/dataset"))
    self.assertIsNotNone(logged_data.id)

    logged_metrics = execution.log_output(
        metadata.Metrics(name="MNIST-evaluation",
                         uri="gcs://my-bucket/mnist-eval.csv"))
    self.assertIsNotNone(logged_metrics.id)

    logged_model = execution.log_output(
        metadata.Model(name="MNIST", uri="gcs://my-bucket/mnist"))
    self.assertIsNotNone(logged_model.id)
def log_model_info(ws, ws_run, description, name, owner, model_uri, version,
                   hyperparameters, learning_rate, layers, early_stop, labels):
    """Log a Model artifact as an input of a newly created execution.

    Args:
      ws: metadata.Workspace to log into.
      ws_run: metadata.Run the execution belongs to.
      description, name, owner, model_uri, version, hyperparameters,
        learning_rate, layers, early_stop, labels: forwarded to
        ``metadata.Model``.
    """
    # Renamed from `exec` (shadowed the builtin); the unused `model_log`
    # binding was dropped.
    execution = metadata.Execution(
        name="Execution" + datetime.utcnow().isoformat("T"),
        workspace=ws,
        run=ws_run,
        description="Model log exec.",
    )
    execution.log_input(
        metadata.Model(
            description=description,
            name=name,
            owner=owner,
            uri=model_uri,
            version=version,
            hyperparameters=hyperparameters,
            learning_rate=learning_rate,
            layers=layers,
            early_stop=early_stop,
            labels=labels,
        ))
def test_log_invalid_artifacts_should_fail(self):
    """Artifacts carrying reserved workspace/run properties are rejected."""
    store = metadata.Store(grpc_host=GRPC_HOST, grpc_port=GRPC_PORT)
    workspace = metadata.Workspace(store=store,
                                   name="ws_1",
                                   description="a workspace for testing",
                                   labels={"n1": "v1"})
    execution = metadata.Execution(name="test execution", workspace=workspace)

    # Caller-set reserved workspace property -> log_input must raise.
    bad_input = ArtifactFixture(
        mlpb.Artifact(uri="gs://uri",
                      custom_properties={
                          metadata._WORKSPACE_PROPERTY_NAME:
                              mlpb.Value(string_value="ws1"),
                      }))
    self.assertRaises(ValueError, execution.log_input, bad_input)

    # Caller-set reserved run property -> log_output must raise.
    bad_output = ArtifactFixture(
        mlpb.Artifact(uri="gs://uri",
                      custom_properties={
                          metadata._RUN_PROPERTY_NAME:
                              mlpb.Value(string_value="run1"),
                      }))
    self.assertRaises(ValueError, execution.log_output, bad_output)
def test_log_invalid_artifacts_should_fail(self):
    """Artifacts carrying reserved workspace/run properties are rejected (REST)."""
    workspace = metadata.Workspace(
        backend_url_prefix="127.0.0.1:8080",
        name="ws_1",
        description="a workspace for testing",
        labels={"n1": "v1"})
    execution = metadata.Execution(name="test execution", workspace=workspace)

    # Caller-set reserved workspace property -> log_input must raise.
    bad_input = ArtifactFixture(openapi_client.MlMetadataArtifact(
        uri="gs://uri",
        custom_properties={
            metadata.WORKSPACE_PROPERTY_NAME:
                openapi_client.MlMetadataValue(string_value="ws1"),
        }
    ))
    self.assertRaises(ValueError, execution.log_input, bad_input)

    # Caller-set reserved run property -> log_output must raise.
    bad_output = ArtifactFixture(openapi_client.MlMetadataArtifact(
        uri="gs://uri",
        custom_properties={
            metadata.RUN_PROPERTY_NAME:
                openapi_client.MlMetadataValue(string_value="run1"),
        }
    ))
    self.assertRaises(ValueError, execution.log_output, bad_output)
# Resolve model/data locations, then record the trained model and the
# training dataset in the metadata store under a fresh execution.
args.model = 'model/' + args.model
model_path = str(
    Path(args.base_path).resolve(strict=False).joinpath(
        args.model).resolve(strict=False))
# NOTE(review): data_path is computed but not used in this segment —
# presumably consumed further down the script; verify before removing.
data_path = Path(args.base_path).joinpath(args.data).resolve(strict=False)
dataset = Path(args.base_path).joinpath(args.dataset)
labels = {'run_id': args.run_id}

ws = get_ws("Kubemlops", "MLops for Kubeflow")
run = metadata.Run(workspace=ws,
                   name="run-" + datetime.utcnow().isoformat("T"),
                   description="Run for training TacosBurritos model")
execution = metadata.Execution(
    name="execution" + datetime.utcnow().isoformat("T"),
    workspace=ws,
    run=run,
    description="tacos-burritos",
)
print("An execution was created with id %s" % execution.id)

info('Log model artifact')
model_version = "model_version_" + str(uuid4())
log_model(args.model_name, model_path, model_version, execution,
          json.dumps(labels))

info('Log training data set')  # fixed typo: "traning" -> "training"
log_dataset(args.dataset, str(dataset), execution, json.dumps(labels))
def test_log_metadata_successfully(self):
    """Fully-populated artifacts are logged, listed, and lineage-tracked (REST)."""
    workspace = metadata.Workspace(
        backend_url_prefix="127.0.0.1:8080",
        name="ws_1",
        description="a workspace for testing",
        labels={"n1": "v1"})
    workspace_run = metadata.Run(
        workspace=workspace,
        name="first run",
        description="first run in ws_1",
    )
    execution = metadata.Execution(
        name="test execution",
        workspace=workspace,
        run=workspace_run,
        description="an execution",
    )
    self.assertIsNotNone(execution.id)

    dataset_artifact = execution.log_input(
        metadata.DataSet(
            description="an example data",
            name="mytable-dump",
            owner="*****@*****.**",
            uri="file://path/to/dataset",
            version="v1.0.0",
            query="SELECT * FROM mytable"))
    self.assertIsNotNone(dataset_artifact.id)

    metrics_artifact = execution.log_output(
        metadata.Metrics(
            name="MNIST-evaluation",
            description="validating the MNIST model to recognize handwritten digits",
            owner="*****@*****.**",
            uri="gcs://my-bucket/mnist-eval.csv",
            data_set_id="123",
            model_id="12345",
            metrics_type=metadata.Metrics.VALIDATION,
            values={"accuracy": 0.95},
            labels={"mylabel": "l1"}))
    self.assertIsNotNone(metrics_artifact.id)

    model_artifact = execution.log_output(
        metadata.Model(
            name="MNIST",
            description="model to recognize handwritten digits",
            owner="*****@*****.**",
            uri="gcs://my-bucket/mnist",
            model_type="neural network",
            training_framework={
                "name": "tensorflow",
                "version": "v1.0"
            },
            hyperparameters={
                "learning_rate": 0.5,
                "layers": [10, 3, 1],
                "early_stop": True
            },
            version="v0.0.1",
            labels={"mylabel": "l1"}))
    self.assertIsNotNone(model_artifact.id)

    # Every artifact type logged above should appear in workspace listings.
    self.assertTrue(len(workspace.list()) > 0)
    self.assertTrue(len(workspace.list(metadata.Model.ARTIFACT_TYPE_NAME)) > 0)
    self.assertTrue(len(workspace.list(metadata.Metrics.ARTIFACT_TYPE_NAME)) > 0)
    self.assertTrue(len(workspace.list(metadata.DataSet.ARTIFACT_TYPE_NAME)) > 0)

    # Lineage: the model has exactly one (output) event, produced by our
    # execution, and that execution touched exactly three artifacts.
    output_events = workspace.client.list_events2(model_artifact.id).events
    assert len(output_events) == 1
    execution_id = output_events[0].execution_id
    assert execution_id == execution.id
    all_events = workspace.client.list_events(execution_id).events
    assert len(all_events) == 3
def _get_or_create_run_execution(self, md_workspace, run_name, exec_name):
    """Build a metadata Execution bound to the given workspace and run.

    NOTE(review): `run_name` is forwarded directly as the `run=` argument —
    confirm the metadata client accepts a name here rather than a Run object.
    """
    description = "Run %s" % exec_name
    return metadata.Execution(name=exec_name,
                              workspace=md_workspace,
                              run=run_name,
                              description=description)
def test_log_metadata_successfully(self):
    """End-to-end logging: train artifacts, serve the model, verify lineage."""
    store = metadata.Store(grpc_host=GRPC_HOST, grpc_port=GRPC_PORT)
    workspace = metadata.Workspace(store=store,
                                   name="test_log_metadata_successfully_ws",
                                   description="a workspace for testing",
                                   labels={"n1": "v1"})
    train_run = metadata.Run(
        workspace=workspace,
        name="first run",
        description="first run in ws_1",
    )
    train_exec = metadata.Execution(
        name="test execution",
        workspace=workspace,
        run=train_run,
        description="an execution",
    )
    self.assertIsNotNone(train_exec.id)

    dataset = train_exec.log_input(
        metadata.DataSet(description="an example data",
                         name="mytable-dump",
                         owner="*****@*****.**",
                         uri="file://path/to/dataset",
                         version=str(uuid.uuid4()),
                         query="SELECT * FROM mytable"))
    self.assertIsNotNone(dataset.id)
    self.assertIsNotNone(repr(dataset))

    eval_metrics = train_exec.log_output(
        metadata.Metrics(
            name="MNIST-evaluation",
            description="validating the MNIST model to recognize handwritten digits",
            owner="*****@*****.**",
            uri="gcs://my-bucket/mnist-eval.csv",
            data_set_id="123",
            model_id="12345",
            metrics_type=metadata.Metrics.VALIDATION,
            values={"accuracy": 0.95},
            labels={"mylabel": "l1"}))
    self.assertIsNotNone(eval_metrics.id)
    self.assertIsNotNone(repr(eval_metrics))

    model_version = str(uuid.uuid4())
    trained_model = train_exec.log_output(
        metadata.Model(name="MNIST",
                       description="model to recognize handwritten digits",
                       owner="*****@*****.**",
                       uri="gcs://my-bucket/mnist",
                       model_type="neural network",
                       training_framework={
                           "name": "tensorflow",
                           "version": "v1.0"
                       },
                       hyperparameters={
                           "learning_rate": 0.5,
                           "layers": [10, 3, 1],
                           "early_stop": True
                       },
                       version=model_version,
                       labels={"mylabel": "l1"}))
    self.assertIsNotNone(trained_model.id)
    self.assertIsNotNone(repr(trained_model))

    serving_exec = metadata.Execution(
        name="serving model",
        workspace=workspace,
        description="an execution to represent model serving component",
    )
    self.assertIsNotNone(serving_exec.id)
    # Model name, version, and uri together identify the existing model.
    served_model = metadata.Model(
        name="MNIST",
        uri="gcs://my-bucket/mnist",
        version=model_version,
    )
    serving_exec.log_input(served_model)

    # Everything logged above should appear in workspace listings.
    self.assertTrue(len(workspace.list()) > 0)
    self.assertTrue(len(workspace.list(metadata.Model.ARTIFACT_TYPE_NAME)) > 0)
    self.assertTrue(len(workspace.list(metadata.Metrics.ARTIFACT_TYPE_NAME)) > 0)
    self.assertTrue(len(workspace.list(metadata.DataSet.ARTIFACT_TYPE_NAME)) > 0)

    # Lineage: the model has two events (training output + serving input),
    # and the trainer touched exactly the three artifacts logged above.
    model_events = workspace.store.get_events_by_artifact_ids(
        [trained_model.id])
    self.assertEqual(len(model_events), 2)
    execution_ids = set(e.execution_id for e in model_events)
    assert execution_ids == set([serving_exec.id, train_exec.id])
    trainer_events = workspace.store.get_events_by_execution_ids(
        [train_exec.id])
    artifact_ids = set(e.artifact_id for e in trainer_events)
    assert artifact_ids == set([trained_model.id, eval_metrics.id, dataset.id])