# Example 1
    def testFullTaxiGcpPipeline(self):
        """Compiles and runs the full chicago-taxi pipeline end-to-end on GCP.

        NOTE(review): a later method in this class defines the same name, so
        this definition is shadowed at class-creation time — confirm whether
        it should be removed.
        """
        # Unique name per run so concurrent/retried test runs don't collide.
        pipeline_name = 'gcp-perf-test-full-e2e-test-{}'.format(
            test_utils.random_id())

        # Custom CAIP training job using a testing image.
        training_args = {
            'project': self._GCP_PROJECT_ID,
            'region': self._GCP_REGION,
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'masterConfig': {
                'imageUri': self._CONTAINER_IMAGE
            },
            'workerType': self._WORKER_TYPE,
            'parameterServerType': 'standard',
            'workerCount': self._WORKER_COUNT,
            'parameterServerCount': self._PARAMETER_SERVER_COUNT
        }

        taxi_pipeline = taxi_pipeline_kubeflow_gcp.create_pipeline(
            pipeline_name=pipeline_name,
            pipeline_root=self._pipeline_root(pipeline_name),
            module_file=self._MODULE_FILE,
            ai_platform_training_args=training_args,
            ai_platform_serving_args=self._AI_PLATFORM_SERVING_ARGS,
            beam_pipeline_args=self._BEAM_PIPELINE_ARGS)
        self._compile_and_run_pipeline(
            pipeline=taxi_pipeline,
            query_sample_rate=1,
            # (1M * batch_size=200) / 200M records ~ 1 epoch
            train_steps=1000000,
            eval_steps=10000,
            worker_count=20,
            parameter_server_count=3,
        )
    def testTaxiPipelineConstructionAndDefinitionFileExists(self):
        """Builds the logical pipeline, compiles it, and checks the archive.

        Verifies the component count of the constructed pipeline and that
        KubeflowDagRunner emits the expected .tar.gz definition file.
        """
        pipeline = taxi_pipeline_kubeflow_gcp.create_pipeline(
            pipeline_name=taxi_pipeline_kubeflow_gcp._pipeline_name,
            pipeline_root=taxi_pipeline_kubeflow_gcp._pipeline_root,
            module_file=taxi_pipeline_kubeflow_gcp._module_file,
            ai_platform_training_args=(
                taxi_pipeline_kubeflow_gcp._ai_platform_training_args),
            ai_platform_serving_args=(
                taxi_pipeline_kubeflow_gcp._ai_platform_serving_args))
        # The sample pipeline is expected to contain exactly nine components.
        self.assertEqual(9, len(pipeline.components))

        KubeflowDagRunner().run(pipeline)
        expected_tarball = os.path.join(
            self._tmp_dir, 'chicago_taxi_pipeline_kubeflow_gcp.tar.gz')
        self.assertTrue(tf.io.gfile.exists(expected_tarball))
    def testFullTaxiGcpPipeline(self):
        """Compiles and runs the full chicago-taxi pipeline end-to-end on GCP.

        Builds a CAIP (AI Platform) custom training job spec from the class
        fixtures, constructs the pipeline, and submits it via
        _compile_and_run_pipeline.
        """
        pipeline_name = 'gcp-perf-test-full-e2e-test-{}'.format(
            test_utils.random_id())

        # Custom CAIP training job using a testing image.
        ai_platform_training_args = {
            'project': self._GCP_PROJECT_ID,
            'region': self._GCP_REGION,
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'masterConfig': {
                # Fix: was `self.container_image`, which does not match the
                # UPPER_SNAKE fixture naming used everywhere else in this
                # class (and in the sibling copy of this test).
                'imageUri': self._CONTAINER_IMAGE
            },
            'workerType': self._WORKER_TYPE,
            'parameterServerType': 'standard',
            'workerCount': self._WORKER_COUNT,
            'parameterServerCount': self._PARAMETER_SERVER_COUNT
        }

        pipeline = taxi_pipeline_kubeflow_gcp.create_pipeline(
            pipeline_name=pipeline_name,
            pipeline_root=self._pipeline_root(pipeline_name),
            module_file=self._MODULE_FILE,
            ai_platform_training_args=ai_platform_training_args,
            ai_platform_serving_args=self._AI_PLATFORM_SERVING_ARGS,
            beam_pipeline_args=self._BEAM_PIPELINE_ARGS)
        # TODO(b/162451308): Add this clean-up back after we re-enable AIP pusher
        # when AIP prediction service supports TF>=2.3.
        # self.addCleanup(kubeflow_test_utils.delete_ai_platform_model,
        #                 self._MODEL_NAME)
        self._compile_and_run_pipeline(
            pipeline=pipeline,
            query_sample_rate=1,
            # (1M * batch_size=200) / 200M records ~ 1 epoch
            train_steps=1000000,
            eval_steps=10000,
            worker_count=20,
            parameter_server_count=3,
        )