from typing import List

from google.cloud import aiplatform


def custom_training_job_sample(
    project: str,
    location: str,
    bucket: str,
    display_name: str,
    script_path: str,
    script_args: List[str],
    container_uri: str,
    model_serving_container_image_uri: str,
    requirements: List[str],
    replica_count: int,
):
    aiplatform.init(project=project, location=location, staging_bucket=bucket)

    job = aiplatform.CustomTrainingJob(
        display_name=display_name,
        script_path=script_path,
        container_uri=container_uri,
        requirements=requirements,
        model_serving_container_image_uri=model_serving_container_image_uri,
    )

    model = job.run(args=script_args,
                    replica_count=replica_count,
                    model_display_name=display_name)

    return model
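A minimal invocation sketch; every value below is an illustrative placeholder, not taken from the original sample:

# Hypothetical values for illustration only.
model = custom_training_job_sample(
    project="my-project",
    location="us-central1",
    bucket="gs://my-staging-bucket",
    display_name="my-custom-job",
    script_path="task.py",
    script_args=["--epochs=10"],
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest",
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest",
    requirements=["pandas"],
    replica_count=1,
)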
Example 2
def train_model(bucket_name: str) -> Iterator[str]:
    aiplatform.init(project=PROJECT, staging_bucket=bucket_name)
    job = aiplatform.CustomTrainingJob(
        display_name="climate_script_colab",
        script_path="task.py",
        container_uri="us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-7:latest",
    )

    job.run(
        accelerator_type="NVIDIA_TESLA_K80",
        accelerator_count=1,
        args=[f"--bucket={bucket_name}"],
    )

    logging.info(f"train_model resource_name: {job.resource_name}")

    # Wait until the model training job finishes.
    status = None
    logging.info("Waiting for model to train.")
    for _ in range(0, TIMEOUT_SEC, POLL_INTERVAL_SEC):
        # https://googleapis.dev/python/aiplatform/latest/aiplatform_v1/job_service.html
        status = job.state.name
        if status in VERTEX_AI_FINISHED_STATE:
            break
        time.sleep(POLL_INTERVAL_SEC)

    logging.info(f"Model job finished with status {status}")
    assert status == VERTEX_AI_SUCCESS_STATE
    yield job.resource_name
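This snippet leans on imports and module-level constants defined elsewhere in its file. A plausible sketch of that setup follows; the concrete values are assumptions, not from the original:

# Assumed module-level setup; the values are hypothetical.
import logging
import time
from typing import Iterator

from google.cloud import aiplatform

PROJECT = "my-project"            # placeholder project ID
TIMEOUT_SEC = 30 * 60             # give training up to 30 minutes
POLL_INTERVAL_SEC = 60            # poll job state once a minute
# CustomTrainingJob.state is a PipelineState enum, so compare by name.
VERTEX_AI_SUCCESS_STATE = "PIPELINE_STATE_SUCCEEDED"
VERTEX_AI_FINISHED_STATE = {
    "PIPELINE_STATE_SUCCEEDED",
    "PIPELINE_STATE_FAILED",
    "PIPELINE_STATE_CANCELLED",
}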
Example 3
def train_custom_model(data_set,
                       timestamp,
                       develop_mode,
                       cpu_only_mode,
                       tf_version,
                       extra_args=None):
    # Set up training and deployment infra

    if cpu_only_mode:
        train_image = f"us-docker.pkg.dev/vertex-ai/training/tf-cpu.{tf_version}:latest"
        deploy_image = f"us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.{tf_version}:latest"
    else:
        train_image = f"us-docker.pkg.dev/vertex-ai/training/tf-gpu.{tf_version}:latest"
        deploy_image = f"us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.{tf_version}:latest"

    # train
    model_display_name = '{}-{}'.format(ENDPOINT_NAME, timestamp)
    job = aiplatform.CustomTrainingJob(
        display_name='train-{}'.format(model_display_name),
        script_path="model.py",
        container_uri=train_image,
        requirements=['cloudml-hypertune'],  # any extra Python packages
        model_serving_container_image_uri=deploy_image)
    model_args = [
        '--bucket',
        BUCKET,
    ]
    if develop_mode:
        model_args += ['--develop']
    if extra_args:
        model_args += extra_args

    if cpu_only_mode:
        model = job.run(
            dataset=data_set,
            # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
            predefined_split_column_name='data_split',
            model_display_name=model_display_name,
            args=model_args,
            replica_count=1,
            machine_type='n1-standard-4',
            sync=develop_mode)
    else:
        model = job.run(
            dataset=data_set,
            # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
            predefined_split_column_name='data_split',
            model_display_name=model_display_name,
            args=model_args,
            replica_count=1,
            machine_type='n1-standard-4',
            # See https://cloud.google.com/vertex-ai/docs/general/locations#accelerators
            accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4.name,
            accelerator_count=1,
            sync=develop_mode)
    return model
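Several names here (aiplatform, aip, ENDPOINT_NAME, BUCKET) come from the enclosing module. A hedged sketch of what that context might look like; the import alias and values are assumptions:

# Hypothetical module-level context; adjust for your project.
from google.cloud import aiplatform
from google.cloud import aiplatform_v1 as aip  # provides the AcceleratorType enum

ENDPOINT_NAME = "housing"         # placeholder endpoint base name
BUCKET = "my-staging-bucket"      # placeholder Cloud Storage bucket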
Example 4
from typing import List, Optional, Union

from google.cloud import aiplatform


def create_training_pipeline_custom_job_sample(
    project: str,
    location: str,
    staging_bucket: str,
    display_name: str,
    script_path: str,
    container_uri: str,
    model_serving_container_image_uri: str,
    dataset_id: Optional[str] = None,
    model_display_name: Optional[str] = None,
    args: Optional[List[Union[str, float, int]]] = None,
    replica_count: int = 1,
    machine_type: str = "n1-standard-4",
    accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
    accelerator_count: int = 0,
    training_fraction_split: float = 0.8,
    validation_fraction_split: float = 0.1,
    test_fraction_split: float = 0.1,
    sync: bool = True,
):
    aiplatform.init(project=project,
                    location=location,
                    staging_bucket=staging_bucket)

    job = aiplatform.CustomTrainingJob(
        display_name=display_name,
        script_path=script_path,
        container_uri=container_uri,
        model_serving_container_image_uri=model_serving_container_image_uri,
    )

    # This example uses an ImageDataset, but you can use another type
    dataset = aiplatform.ImageDataset(dataset_id) if dataset_id else None

    model = job.run(
        dataset=dataset,
        model_display_name=model_display_name,
        args=args,
        replica_count=replica_count,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        training_fraction_split=training_fraction_split,
        validation_fraction_split=validation_fraction_split,
        test_fraction_split=test_fraction_split,
        sync=sync,
    )

    model.wait()

    print(model.display_name)
    print(model.resource_name)
    print(model.uri)
    return model
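A hedged invocation sketch; the IDs, images, and names below are illustrative placeholders:

# Hypothetical call for illustration only.
model = create_training_pipeline_custom_job_sample(
    project="my-project",
    location="us-central1",
    staging_bucket="gs://my-staging-bucket",
    display_name="flowers-custom-train",
    script_path="task.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest",
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest",
    dataset_id="1234567890",        # placeholder ImageDataset ID
    model_display_name="flowers-model",
    replica_count=1,
)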
Example 5
    def test_dataset_create_to_model_predict(
        self,
        create_dataset_mock,  # noqa: F811
        import_data_mock,  # noqa: F811
        predict_client_predict_mock,  # noqa: F811
        mock_python_package_to_gcs,  # noqa: F811
        mock_pipeline_service_create,  # noqa: F811
        mock_model_service_get,  # noqa: F811
        mock_pipeline_service_get,  # noqa: F811
        sync,
    ):

        aiplatform.init(
            project=test_datasets._TEST_PROJECT,
            staging_bucket=test_training_jobs._TEST_BUCKET_NAME,
            credentials=test_training_jobs._TEST_CREDENTIALS,
        )

        my_dataset = aiplatform.ImageDataset.create(
            display_name=test_datasets._TEST_DISPLAY_NAME,
            encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
            sync=sync,
        )

        my_dataset.import_data(
            gcs_source=test_datasets._TEST_SOURCE_URI_GCS,
            import_schema_uri=test_datasets._TEST_IMPORT_SCHEMA_URI,
            data_item_labels=test_datasets._TEST_DATA_LABEL_ITEMS,
            sync=sync,
        )

        job = aiplatform.CustomTrainingJob(
            display_name=test_training_jobs._TEST_DISPLAY_NAME,
            script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME,
            container_uri=test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
            model_serving_container_image_uri=test_training_jobs._TEST_SERVING_CONTAINER_IMAGE,
            model_serving_container_predict_route=test_training_jobs._TEST_SERVING_CONTAINER_PREDICTION_ROUTE,
            model_serving_container_health_route=test_training_jobs._TEST_SERVING_CONTAINER_HEALTH_ROUTE,
        )

        model_from_job = job.run(
            dataset=my_dataset,
            base_output_dir=test_training_jobs._TEST_BASE_OUTPUT_DIR,
            args=test_training_jobs._TEST_RUN_ARGS,
            replica_count=1,
            machine_type=test_training_jobs._TEST_MACHINE_TYPE,
            accelerator_type=test_training_jobs._TEST_ACCELERATOR_TYPE,
            accelerator_count=test_training_jobs._TEST_ACCELERATOR_COUNT,
            model_display_name=test_training_jobs._TEST_MODEL_DISPLAY_NAME,
            training_fraction_split=test_training_jobs._TEST_TRAINING_FRACTION_SPLIT,
            validation_fraction_split=test_training_jobs._TEST_VALIDATION_FRACTION_SPLIT,
            test_fraction_split=test_training_jobs._TEST_TEST_FRACTION_SPLIT,
            sync=sync,
        )

        created_endpoint = models.Endpoint.create(
            display_name=test_endpoints._TEST_DISPLAY_NAME,
            encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
            sync=sync,
        )

        my_endpoint = model_from_job.deploy(
            encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME, sync=sync)

        endpoint_deploy_return = created_endpoint.deploy(model_from_job,
                                                         sync=sync)

        assert endpoint_deploy_return is None

        if not sync:
            my_endpoint.wait()
            created_endpoint.wait()

        test_prediction = created_endpoint.predict(instances=[[1.0, 2.0, 3.0],
                                                              [1.0, 3.0, 4.0]],
                                                   parameters={"param": 3.0})

        true_prediction = models.Prediction(
            predictions=test_endpoints._TEST_PREDICTION,
            deployed_model_id=test_endpoints._TEST_ID,
        )

        assert true_prediction == test_prediction
        predict_client_predict_mock.assert_called_once_with(
            endpoint=test_endpoints._TEST_ENDPOINT_NAME,
            instances=[[1.0, 2.0, 3.0], [1.0, 3.0, 4.0]],
            parameters={"param": 3.0},
        )

        expected_dataset = gca_dataset.Dataset(
            display_name=test_datasets._TEST_DISPLAY_NAME,
            metadata_schema_uri=test_datasets._TEST_METADATA_SCHEMA_URI_NONTABULAR,
            metadata=test_datasets._TEST_NONTABULAR_DATASET_METADATA,
            encryption_spec=_TEST_ENCRYPTION_SPEC,
        )

        expected_import_config = gca_dataset.ImportDataConfig(
            gcs_source=gca_io.GcsSource(
                uris=[test_datasets._TEST_SOURCE_URI_GCS]),
            import_schema_uri=test_datasets._TEST_IMPORT_SCHEMA_URI,
            data_item_labels=test_datasets._TEST_DATA_LABEL_ITEMS,
        )

        create_dataset_mock.assert_called_once_with(
            parent=test_datasets._TEST_PARENT,
            dataset=expected_dataset,
            metadata=test_datasets._TEST_REQUEST_METADATA,
        )

        import_data_mock.assert_called_once_with(
            name=test_datasets._TEST_NAME,
            import_configs=[expected_import_config])

        expected_dataset.name = test_datasets._TEST_NAME
        assert my_dataset._gca_resource == expected_dataset

        mock_python_package_to_gcs.assert_called_once_with(
            gcs_staging_dir=test_training_jobs._TEST_BUCKET_NAME,
            project=test_training_jobs._TEST_PROJECT,
            credentials=initializer.global_config.credentials,
        )

        true_args = test_training_jobs._TEST_RUN_ARGS

        true_worker_pool_spec = {
            "replica_count": test_training_jobs._TEST_REPLICA_COUNT,
            "machine_spec": {
                "machine_type": test_training_jobs._TEST_MACHINE_TYPE,
                "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
                "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
            },
            "python_package_spec": {
                "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
                "python_module": source_utils._TrainingScriptPythonPackager.module_name,
                "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH],
                "args": true_args,
            },
        }

        true_fraction_split = gca_training_pipeline.FractionSplit(
            training_fraction=test_training_jobs._TEST_TRAINING_FRACTION_SPLIT,
            validation_fraction=test_training_jobs._TEST_VALIDATION_FRACTION_SPLIT,
            test_fraction=test_training_jobs._TEST_TEST_FRACTION_SPLIT,
        )

        true_container_spec = gca_model.ModelContainerSpec(
            image_uri=test_training_jobs._TEST_SERVING_CONTAINER_IMAGE,
            predict_route=test_training_jobs._TEST_SERVING_CONTAINER_PREDICTION_ROUTE,
            health_route=test_training_jobs._TEST_SERVING_CONTAINER_HEALTH_ROUTE,
        )

        true_managed_model = gca_model.Model(
            display_name=test_training_jobs._TEST_MODEL_DISPLAY_NAME,
            container_spec=true_container_spec,
        )

        true_input_data_config = gca_training_pipeline.InputDataConfig(
            fraction_split=true_fraction_split,
            dataset_id=my_dataset.name,
            gcs_destination=gca_io.GcsDestination(
                output_uri_prefix=test_training_jobs._TEST_BASE_OUTPUT_DIR),
        )

        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
            display_name=test_training_jobs._TEST_DISPLAY_NAME,
            training_task_definition=schema.training_job.definition.custom_task,
            training_task_inputs=json_format.ParseDict(
                {
                    "worker_pool_specs": [true_worker_pool_spec],
                    "base_output_directory": {
                        "output_uri_prefix":
                        test_training_jobs._TEST_BASE_OUTPUT_DIR
                    },
                },
                struct_pb2.Value(),
            ),
            model_to_upload=true_managed_model,
            input_data_config=true_input_data_config,
        )

        mock_pipeline_service_create.assert_called_once_with(
            parent=initializer.global_config.common_location_path(),
            training_pipeline=true_training_pipeline,
        )

        assert job._gca_resource is mock_pipeline_service_get.return_value

        mock_model_service_get.assert_called_once_with(
            name=test_training_jobs._TEST_MODEL_NAME)

        assert model_from_job._gca_resource is mock_model_service_get.return_value

        assert job.get_model()._gca_resource is mock_model_service_get.return_value

        assert not job.has_failed

        assert job.state == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
Example 6
    def test_end_to_end_tabular(self, shared_state):
        """Build dataset, train a custom and AutoML model, deploy, and get predictions"""

        assert shared_state["bucket"]
        bucket = shared_state["bucket"]

        blob = bucket.blob(_BLOB_PATH)

        # Download the CSV file into memory and save it directly to the staging bucket
        with request.urlopen(_DATASET_SRC) as response:
            data = response.read()
            blob.upload_from_string(data)

        # Collection of resources generated by this test, to be deleted during teardown
        shared_state["resources"] = []

        aiplatform.init(
            project=e2e_base._PROJECT,
            location=e2e_base._LOCATION,
            staging_bucket=shared_state["staging_bucket_name"],
        )

        # Create and import data into a single managed dataset used by both training jobs

        ds = aiplatform.TabularDataset.create(
            display_name=f"{self._temp_prefix}-dataset-{uuid.uuid4()}",
            gcs_source=[
                f'gs://{shared_state["staging_bucket_name"]}/{_BLOB_PATH}'
            ],
            sync=False,
        )

        shared_state["resources"].extend([ds])

        # Define both training jobs

        custom_job = aiplatform.CustomTrainingJob(
            display_name=f"{self._temp_prefix}-train-housing-custom-{uuid.uuid4()}",
            script_path=_LOCAL_TRAINING_SCRIPT_PATH,
            container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
            requirements=["gcsfs==0.7.1"],
            model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest",
        )

        automl_job = aiplatform.AutoMLTabularTrainingJob(
            display_name=f"{self._temp_prefix}-train-housing-automl-{uuid.uuid4()}",
            optimization_prediction_type="regression",
            optimization_objective="minimize-rmse",
        )

        # Kick off both training jobs; the AutoML job will take roughly an hour to run

        custom_model = custom_job.run(
            ds,
            replica_count=1,
            model_display_name=f"{self._temp_prefix}-custom-housing-model-{uuid.uuid4()}",
            sync=False,
        )

        automl_model = automl_job.run(
            dataset=ds,
            target_column="median_house_value",
            model_display_name=f"{self._temp_prefix}-automl-housing-model-{uuid.uuid4()}",
            sync=False,
        )

        shared_state["resources"].extend(
            [automl_job, automl_model, custom_job, custom_model])

        # Deploy both models after training completes
        custom_endpoint = custom_model.deploy(machine_type="n1-standard-4",
                                              sync=False)
        automl_endpoint = automl_model.deploy(machine_type="n1-standard-4",
                                              sync=False)
        shared_state["resources"].extend([automl_endpoint, custom_endpoint])

        # Send online prediction with same instance to both deployed models
        # This sample is taken from an observation where median_house_value = 94600
        custom_endpoint.wait()
        custom_prediction = custom_endpoint.predict([
            {
                "longitude": -124.35,
                "latitude": 40.54,
                "housing_median_age": 52.0,
                "total_rooms": 1820.0,
                "total_bedrooms": 300.0,
                "population": 806,
                "households": 270.0,
                "median_income": 3.014700,
            },
        ])
        automl_endpoint.wait()
        automl_prediction = automl_endpoint.predict([
            {
                "longitude": "-124.35",
                "latitude": "40.54",
                "housing_median_age": "52.0",
                "total_rooms": "1820.0",
                "total_bedrooms": "300.0",
                "population": "806",
                "households": "270.0",
                "median_income": "3.014700",
            },
        ])

        # Ensure a single prediction was returned
        assert len(custom_prediction.predictions) == 1
        assert len(automl_prediction.predictions) == 1

        # Sanity-check that both predictions fall in a plausible price range
        try:
            automl_result = automl_prediction.predictions[0]["value"]
            custom_result = custom_prediction.predictions[0][0]
            assert 200000 > automl_result > 50000
            assert 200000 > custom_result > 50000
        except KeyError as e:
            raise RuntimeError("Unexpected prediction response structure") from e
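Note that the two deployed models return differently shaped payloads, which is why the assertions above index them differently. Illustrative shapes, with made-up values (the exact keys beyond "value" may vary):

# AutoML tabular regression wraps each prediction in a dict:
#   automl_prediction.predictions == [{"value": 94600.0, ...}]
# The custom TF model returns raw output arrays:
#   custom_prediction.predictions == [[94600.0]]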
Example 7
    def test_end_to_end_tabular(self, shared_state):
        """Build dataset, train a custom and AutoML model, deploy, and get predictions"""

        assert shared_state["bucket"]
        bucket = shared_state["bucket"]

        blob = bucket.blob(_BLOB_PATH)

        # Download the CSV file into memory and save it directly to the staging bucket
        with request.urlopen(_DATASET_SRC) as response:
            data = response.read()
            blob.upload_from_string(data)

        # Collection of resources generated by this test, to be deleted during teardown
        shared_state["resources"] = []

        aiplatform.init(
            project=e2e_base._PROJECT,
            location=e2e_base._LOCATION,
            staging_bucket=shared_state["staging_bucket_name"],
        )

        # Create and import data into a single managed dataset used by both training jobs

        dataset_gcs_source = f'gs://{shared_state["staging_bucket_name"]}/{_BLOB_PATH}'

        ds = aiplatform.TabularDataset.create(
            display_name=self._make_display_name("dataset"),
            gcs_source=[dataset_gcs_source],
            sync=False,
            create_request_timeout=180.0,
        )

        shared_state["resources"].extend([ds])

        # Define both training jobs

        custom_job = aiplatform.CustomTrainingJob(
            display_name=self._make_display_name("train-housing-custom"),
            script_path=_LOCAL_TRAINING_SCRIPT_PATH,
            container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
            requirements=["gcsfs==0.7.1"],
            model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest",
        )

        automl_job = aiplatform.AutoMLTabularTrainingJob(
            display_name=self._make_display_name("train-housing-automl"),
            optimization_prediction_type="regression",
            optimization_objective="minimize-rmse",
        )

        # Kick off both training jobs; the AutoML job will take roughly an hour to run

        custom_model = custom_job.run(
            ds,
            replica_count=1,
            model_display_name=self._make_display_name("custom-housing-model"),
            timeout=1234,
            restart_job_on_worker_restart=True,
            enable_web_access=True,
            sync=False,
            create_request_timeout=None,
        )

        automl_model = automl_job.run(
            dataset=ds,
            target_column="median_house_value",
            model_display_name=self._make_display_name("automl-housing-model"),
            sync=False,
        )

        shared_state["resources"].extend(
            [automl_job, automl_model, custom_job, custom_model])

        # Deploy both models after training completes
        custom_endpoint = custom_model.deploy(machine_type="n1-standard-4",
                                              sync=False)
        automl_endpoint = automl_model.deploy(machine_type="n1-standard-4",
                                              sync=False)
        shared_state["resources"].extend([automl_endpoint, custom_endpoint])

        custom_batch_prediction_job = custom_model.batch_predict(
            job_display_name=self._make_display_name("automl-housing-model"),
            instances_format="csv",
            machine_type="n1-standard-4",
            gcs_source=dataset_gcs_source,
            gcs_destination_prefix=f'gs://{shared_state["staging_bucket_name"]}/bp_results/',
            sync=False,
        )

        shared_state["resources"].append(custom_batch_prediction_job)

        in_progress_done_check = custom_job.done()
        custom_job.wait_for_resource_creation()

        automl_job.wait_for_resource_creation()
        custom_batch_prediction_job.wait_for_resource_creation()

        # Send online prediction with same instance to both deployed models
        # This sample is taken from an observation where median_house_value = 94600
        custom_endpoint.wait()

        # Check scheduling is correctly set
        assert (custom_job._gca_resource.training_task_inputs["scheduling"]
                ["timeout"] == "1234s")
        assert (custom_job._gca_resource.training_task_inputs["scheduling"]
                ["restartJobOnWorkerRestart"] is True)

        custom_prediction = custom_endpoint.predict([_INSTANCE], timeout=180.0)

        custom_batch_prediction_job.wait()

        automl_endpoint.wait()
        automl_prediction = automl_endpoint.predict(
            [{k: str(v) for k, v in _INSTANCE.items()}],  # cast values to strings
            timeout=180.0,
        )

        # Test lazy loading of Endpoint; check that the getter was never called after predict()
        custom_endpoint = aiplatform.Endpoint(custom_endpoint.resource_name)
        custom_endpoint.predict([_INSTANCE])

        completion_done_check = custom_job.done()
        assert custom_endpoint._skipped_getter_call()

        assert (custom_job.state ==
                gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED)
        assert (automl_job.state ==
                gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED)
        assert (custom_batch_prediction_job.state ==
                gca_job_state.JobState.JOB_STATE_SUCCEEDED)

        # Ensure a single prediction was returned
        assert len(custom_prediction.predictions) == 1
        assert len(automl_prediction.predictions) == 1

        # Sanity-check that both predictions fall in a plausible price range
        try:
            automl_result = automl_prediction.predictions[0]["value"]
            custom_result = custom_prediction.predictions[0][0]
            assert 200000 > automl_result > 50000
            assert 200000 > custom_result > 50000
        except KeyError as e:
            raise RuntimeError("Unexpected prediction response structure") from e

        # Check done() method works correctly
        assert in_progress_done_check is False
        assert completion_done_check is True
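This example references a module-level _INSTANCE constant that is not shown. Judging from the inline payload in the previous example, it presumably looks like the following (reconstructed for illustration, not copied from the original file):

# Presumed definition of _INSTANCE, reconstructed from the inline
# payload in the previous example; treat as illustrative.
_INSTANCE = {
    "longitude": -124.35,
    "latitude": 40.54,
    "housing_median_age": 52.0,
    "total_rooms": 1820.0,
    "total_bedrooms": 300.0,
    "population": 806,
    "households": 270.0,
    "median_income": 3.014700,
}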