def test_reuse_default_cluster_if_not_configured(self):
  """A pipeline not configured for Cloud picks up the default cluster."""
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions()  # Pipeline is not configured to run on Cloud.
  pipeline = beam.Pipeline(runner=runner, options=options)
  default_meta = ClusterMetadata(
      project_id='test-project', region='test-region')
  default_meta.master_url = 'test-url'
  default_meta.dashboard = 'test-dashboard'
  manager = DataprocClusterManager(default_meta)
  # Make a default cluster known to the environment.
  clusters.dataproc_cluster_managers[default_meta] = manager
  clusters.set_default_cluster(default_meta)

  runner.configure_for_flink(pipeline, options)

  # The default cluster is selected for the pipeline.
  tuned_meta = clusters.cluster_metadata(pipeline)
  self.assertIs(tuned_meta, clusters.default_cluster_metadata)
  # The pipeline becomes known and is tracked by its cluster manager.
  self.assertIn(pipeline, clusters.pipelines)
  self.assertIn(pipeline, clusters.pipelines[pipeline].pipelines)
  # The pipeline options are tuned for execution on the cluster.
  flink_options = options.view_as(FlinkRunnerOptions)
  self.assertEqual(flink_options.flink_master, tuned_meta.master_url)
  self.assertEqual(
      flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
def test_describe_by_cluster_identifier(self):
  """Verifies that describe() resolves one cluster's metadata from any of
  the three identifier kinds: a pipeline, a master_url, or an equivalent
  ClusterMetadata."""
  known_meta = ClusterMetadata(project_id='test-project')
  known_meta2 = ClusterMetadata(
      project_id='test-project', region='some-other-region')
  dcm = self.clusters.create(known_meta)
  dcm2 = self.clusters.create(known_meta2)
  p = beam.Pipeline()
  p2 = beam.Pipeline()
  self.clusters.pipelines[p] = dcm
  dcm.pipelines.add(p)
  self.clusters.pipelines[p2] = dcm2
  # Fix: p2 is managed by dcm2, so register it with dcm2 (was wrongly
  # added to dcm's pipeline set).
  dcm2.pipelines.add(p2)
  # Identify by pipeline.
  cid_pipeline = p
  meta = self.clusters.describe(cid_pipeline)
  self.assertIs(meta, known_meta)
  # Identify by master_url.
  cid_master_url = known_meta.master_url
  meta = self.clusters.describe(cid_master_url)
  self.assertIs(meta, known_meta)
  # Identify by an equivalent (equal but distinct) metadata instance.
  cid_meta = ClusterMetadata(
      project_id=known_meta.project_id,
      region=known_meta.region,
      cluster_name=known_meta.cluster_name)
  meta = self.clusters.describe(cid_meta)
  self.assertIs(meta, known_meta)
def test_describe_everything(self):
  """describe() without an identifier lists metadata of all clusters."""
  first = ClusterMetadata(project_id='test-project')
  second = ClusterMetadata(
      project_id='test-project', region='some-other-region')
  self.clusters.create(first)
  self.clusters.create(second)
  described = self.clusters.describe()
  self.assertEqual([first, second], described)
def _worker_options_to_cluster_metadata(
    self, options: PipelineOptions, cluster_metadata: ClusterMetadata):
  """Copies set (truthy) WorkerOptions fields onto the cluster metadata."""
  worker_options = options.view_as(WorkerOptions)
  overrides = (
      ('subnetwork', worker_options.subnetwork),
      ('num_workers', worker_options.num_workers),
      ('machine_type', worker_options.machine_type),
  )
  for attr, value in overrides:
    # Unset worker options are left at the metadata's defaults.
    if value:
      setattr(cluster_metadata, attr, value)
def test_force_cleanup_everything(self):
  """cleanup(force=True) deletes and forgets every known cluster."""
  first = ClusterMetadata(project_id='test-project')
  second = ClusterMetadata(project_id='test-project-2')
  self.clusters.create(first)
  self.clusters.create(second)

  self.clusters.cleanup(force=True)

  # Both clusters are deleted and no longer tracked.
  self.assertEqual(self.m_delete_cluster.call_count, 2)
  for each in (first, second):
    self.assertNotIn(each, self.clusters.dataproc_cluster_managers)
  # The default cluster is also reset.
  self.assertIsNone(self.clusters.default_cluster_metadata)
def test_cleanup_by_meta(self):
  """An equivalent (not identical) metadata identifies a cluster to clean."""
  known_meta = ClusterMetadata(
      project_id='test-project', region='test-region')
  self.clusters.create(known_meta)
  equivalent_meta = ClusterMetadata(
      project_id=known_meta.project_id,
      region=known_meta.region,
      cluster_name=known_meta.cluster_name)

  self.clusters.cleanup(equivalent_meta)

  # The cluster is deleted and all bookkeeping is removed.
  self.m_delete_cluster.assert_called_once()
  self.assertNotIn(known_meta, self.clusters.dataproc_cluster_managers)
  self.assertNotIn(known_meta.master_url, self.clusters.master_urls)
  self.assertIsNone(self.clusters.default_cluster_metadata)
def test_get_master_url_and_dashboard(self, mock_parse_method):
  """
  Checks that get_master_url_and_dashboard detects the line containing the
  unique substring identifying the location of the master_url and
  application id of the Flink master.
  """
  metadata = ClusterMetadata(project_id='test-project', region='test-region')
  manager = DataprocClusterManager(metadata)
  manager._fs = MockFileSystem()
  metadata._staging_directory = 'test-staging-bucket'

  master_url, dashboard = manager.get_master_url_and_dashboard()

  self.assertEqual(master_url, 'test-master-url')
  self.assertEqual(dashboard, 'test-dashboard-link')
def test_cleanup_by_a_pipeline(self):
  """Cleanup by pipeline deletes its cluster and all its bookkeeping."""
  metadata = ClusterMetadata(project_id='test-project')
  manager = self.clusters.create(metadata)
  # Associate a pipeline with the cluster, mimicking what the runner does
  # the first time a pipeline is executed.
  options = PipelineOptions()
  options.view_as(FlinkRunnerOptions).flink_master = metadata.master_url
  pipeline = beam.Pipeline(options=options)
  self.clusters.pipelines[pipeline] = manager
  manager.pipelines.add(pipeline)

  self.clusters.cleanup(pipeline)

  # The cluster is deleted.
  self.m_delete_cluster.assert_called_once()
  # The pipeline association is cleaned up.
  self.assertNotIn(pipeline, self.clusters.pipelines)
  self.assertNotIn(pipeline, manager.pipelines)
  self.assertEqual(
      options.view_as(FlinkRunnerOptions).flink_master, '[auto]')
  # The cluster is unknown now.
  self.assertNotIn(metadata, self.clusters.dataproc_cluster_managers)
  self.assertNotIn(metadata.master_url, self.clusters.master_urls)
  # The cleaned up cluster was also the default; the default is cleared.
  self.assertIsNone(self.clusters.default_cluster_metadata)
def test_not_cleanup_if_multiple_pipelines_share_a_manager(self):
  """Cleaning up one pipeline leaves a cluster shared with others intact."""
  metadata = ClusterMetadata(project_id='test-project')
  manager = self.clusters.create(metadata)
  options = PipelineOptions()
  options.view_as(FlinkRunnerOptions).flink_master = metadata.master_url
  options2 = PipelineOptions()
  options2.view_as(FlinkRunnerOptions).flink_master = metadata.master_url
  pipeline = beam.Pipeline(options=options)
  pipeline2 = beam.Pipeline(options=options2)
  # Both pipelines share the same cluster manager.
  for each in (pipeline, pipeline2):
    self.clusters.pipelines[each] = manager
    manager.pipelines.add(each)

  self.clusters.cleanup(pipeline)

  # The shared cluster is not deleted.
  self.m_delete_cluster.assert_not_called()
  # Only the cleaned pipeline's association is removed.
  self.assertNotIn(pipeline, self.clusters.pipelines)
  self.assertNotIn(pipeline, manager.pipelines)
  self.assertEqual(
      options.view_as(FlinkRunnerOptions).flink_master, '[auto]')
  # The other pipeline's association still presents.
  self.assertIn(pipeline2, self.clusters.pipelines)
  self.assertIn(pipeline2, manager.pipelines)
  self.assertEqual(
      options2.view_as(FlinkRunnerOptions).flink_master, metadata.master_url)
  # The cluster is still known and remains the default.
  self.assertIn(metadata, self.clusters.dataproc_cluster_managers)
  self.assertIn(metadata.master_url, self.clusters.master_urls)
  self.assertIs(metadata, self.clusters.default_cluster_metadata)
def configure_for_flink(
    self, user_pipeline: beam.Pipeline, options: PipelineOptions) -> None:
  """Configures the pipeline options for running a job with Flink.

  When running with a FlinkRunner, a job server started from an uber jar
  (locally built or remotely downloaded) hosting the beam_job_api will
  communicate with the Flink cluster located at the given flink_master in the
  pipeline options.

  Args:
    user_pipeline: The user's pipeline; used as the key for associating the
      pipeline with a Dataproc cluster manager.
    options: The pipeline options; mutated in place (via
      _configure_flink_options) to target the chosen cluster.
  """
  clusters = ie.current_env().clusters
  if clusters.pipelines.get(user_pipeline, None):
    # Noop for a known pipeline using a known Dataproc cluster.
    return
  flink_master = options.view_as(FlinkRunnerOptions).flink_master
  # Fall back to the default cluster's metadata unless overridden below.
  cluster_metadata = clusters.default_cluster_metadata
  if flink_master == '[auto]':
    # Try to create/reuse a cluster when no flink_master is given.
    project_id = options.view_as(GoogleCloudOptions).project
    region = options.view_as(GoogleCloudOptions).region
    if project_id:
      if clusters.default_cluster_metadata:
        # Reuse the cluster name from default in case of a known cluster.
        cluster_metadata = ClusterMetadata(
            project_id=project_id,
            region=region,
            cluster_name=clusters.default_cluster_metadata.cluster_name)
      else:
        # Generate the metadata with a new unique cluster name.
        cluster_metadata = ClusterMetadata(
            project_id=project_id, region=region)
      # Add additional configurations.
      self._worker_options_to_cluster_metadata(options, cluster_metadata)
    # else use the default cluster metadata.
  elif flink_master in clusters.master_urls:
    # A known master_url maps back to the metadata of its cluster.
    cluster_metadata = clusters.cluster_metadata(flink_master)
  else:
    # Noop if a self-hosted Flink is in use.
    return
  if not cluster_metadata:
    return  # Not even a default cluster to create/reuse, run Flink locally.
  # Create a new cluster, or get the manager of an equivalent known one.
  dcm = clusters.create(cluster_metadata)
  # Side effects associated with the user_pipeline.
  clusters.pipelines[user_pipeline] = dcm
  dcm.pipelines.add(user_pipeline)
  # Point the pipeline options at the cluster's Flink master.
  self._configure_flink_options(
      options,
      clusters.DATAPROC_FLINK_VERSION,
      dcm.cluster_metadata.master_url)
def test_cluster_metadata_identifies_master_url(self):
  """A known master_url resolves to its registered cluster metadata."""
  master_url = 'test-url'
  known_meta = ClusterMetadata(project_id='test-project')
  _ = DataprocClusterManager(known_meta)
  self.clusters.master_urls[master_url] = known_meta
  self.assertIs(self.clusters.cluster_metadata(master_url), known_meta)
def test_cluster_metadata_identifies_pipeline(self):
  """A known pipeline resolves to its cluster manager's metadata."""
  pipeline = beam.Pipeline()
  known_meta = ClusterMetadata(project_id='test-project')
  self.clusters.pipelines[pipeline] = DataprocClusterManager(known_meta)
  self.assertIs(self.clusters.cluster_metadata(pipeline), known_meta)
def test_cleanup_by_a_master_url(self):
  """A master_url identifies the cluster to clean up."""
  metadata = ClusterMetadata(project_id='test-project')
  self.clusters.create(metadata)

  self.clusters.cleanup(metadata.master_url)

  # The cluster is deleted, forgotten, and the default cluster is reset.
  self.m_delete_cluster.assert_called_once()
  self.assertNotIn(metadata, self.clusters.dataproc_cluster_managers)
  self.assertNotIn(metadata.master_url, self.clusters.master_urls)
  self.assertIsNone(self.clusters.default_cluster_metadata)
def test_get_staging_location_exception(self, mock_cluster_client):
  """
  Verifies that an error raised inside get_staging_location propagates.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(
          project_id='test-project',
          region='test-region',
          cluster_name='test-cluster'))
  with self.assertRaises(MockException):
    manager.get_staging_location()
def test_create_but_reuse_a_known_cluster(self):
  """create() returns the existing manager for an equivalent identifier."""
  known_meta = ClusterMetadata(
      project_id='test-project', region='test-region')
  known_dcm = DataprocClusterManager(known_meta)
  known_meta.master_url = 'test-url'
  # Register the cluster as known and default.
  self.clusters.set_default_cluster(known_meta)
  self.clusters.dataproc_cluster_managers[known_meta] = known_dcm
  self.clusters.master_urls[known_meta.master_url] = known_meta

  # An equivalent (equal but distinct) metadata reuses the known cluster.
  equivalent_meta = ClusterMetadata(
      project_id=known_meta.project_id,
      region=known_meta.region,
      cluster_name=known_meta.cluster_name)
  self.assertIs(self.clusters.create(equivalent_meta), known_dcm)

  # So does the cluster's master_url as the identifier.
  self.assertIs(self.clusters.create(known_meta.master_url), known_dcm)
def test_create_a_new_cluster(self):
  """Creating a cluster fills in derived fields and registers the cluster."""
  metadata = ClusterMetadata(project_id='test-project')
  self.clusters.create(metadata)
  # Derived fields get populated.
  self.assertTrue(metadata.master_url.startswith('test-url'))
  self.assertEqual(metadata.dashboard, 'test-dashboard')
  # The cluster becomes known.
  self.assertIn(metadata, self.clusters.dataproc_cluster_managers)
  self.assertIn(metadata.master_url, self.clusters.master_urls)
  # The newly created cluster becomes the default cluster.
  self.assertIs(metadata, self.clusters.default_cluster_metadata)
def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
  """
  Verifies that an exception is raised when the user specifies a region
  that does not exist.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, manager.create_cluster, {})
    self.assertTrue('Invalid region provided' in context_manager.output[0])
def test_create_cluster_default_already_exists(self, mock_cluster_client):
  """
  Verifies that no exception is raised when a cluster already exists but
  uses ie.current_env().clusters.default_cluster_name.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='INFO') as context_manager:
    manager.create_cluster({})
    self.assertTrue('already exists' in context_manager.output[0])
def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
  """
  Verifies that an exception is raised when cleanup attempts to delete a
  cluster that does not exist.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, manager.cleanup)
    self.assertTrue('Cluster does not exist' in context_manager.output[0])
def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
  """
  Verifies that an exception not handled by any dedicated case under
  cleanup is re-raised.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(MockException, manager.cleanup)
    self.assertTrue('Failed to delete cluster' in context_manager.output[0])
def test_cluster_metadata_default_value(self):
  """Unknown or absent identifiers fall back to the default metadata."""
  default_meta = ClusterMetadata(project_id='test-project')
  self.clusters.set_default_cluster(default_meta)
  # None, an unknown pipeline, and an unknown master_url all resolve to
  # the default cluster metadata.
  for cid in (None, beam.Pipeline(), 'test-url'):
    self.assertIs(default_meta, self.clusters.cluster_metadata(cid))
def configure_for_flink(
    self, user_pipeline: beam.Pipeline, options: PipelineOptions) -> None:
  """Tunes the pipeline options for the setup of running a job with Flink.

  Args:
    user_pipeline: The user's pipeline; used as the key for associating the
      pipeline with a Dataproc cluster manager.
    options: The pipeline options; mutated in place so that flink_master
      and flink_version target the chosen cluster.
  """
  clusters = ie.current_env().clusters
  if clusters.pipelines.get(user_pipeline, None):
    # Noop for a known pipeline using a known Dataproc cluster.
    return
  flink_master = options.view_as(FlinkRunnerOptions).flink_master
  # Fall back to the default cluster's metadata unless overridden below.
  cluster_metadata = clusters.default_cluster_metadata
  if flink_master == '[auto]':
    # Try to create/reuse a cluster when no flink_master is given.
    project_id = options.view_as(GoogleCloudOptions).project
    region = options.view_as(GoogleCloudOptions).region
    if project_id:
      if clusters.default_cluster_metadata:
        # Reuse the cluster name from default in case of a known cluster.
        cluster_metadata = ClusterMetadata(
            project_id=project_id,
            region=region,
            cluster_name=clusters.default_cluster_metadata.cluster_name)
      else:
        # Generate the metadata with a new unique cluster name.
        cluster_metadata = ClusterMetadata(
            project_id=project_id, region=region)
    # else use the default cluster metadata.
  elif flink_master in clusters.master_urls:
    # A known master_url maps back to the metadata of its cluster.
    cluster_metadata = clusters.cluster_metadata(flink_master)
  else:
    # Noop if a self-hosted Flink is in use.
    return
  if not cluster_metadata:
    return  # Not even a default cluster to create/reuse, run Flink locally.
  # Create a new cluster, or get the manager of an equivalent known one.
  dcm = clusters.create(cluster_metadata)
  # Side effects associated with the user_pipeline.
  clusters.pipelines[user_pipeline] = dcm
  dcm.pipelines.add(user_pipeline)
  # Point the pipeline options at the cluster's Flink master.
  flink_options = options.view_as(FlinkRunnerOptions)
  flink_options.flink_master = dcm.cluster_metadata.master_url
  flink_options.flink_version = clusters.DATAPROC_FLINK_VERSION
def test_get_staging_location(self, mock_cluster_client, mock_list):
  """
  Verifies that a mock staging location is received successfully via
  get_staging_location.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(
          project_id='test-project',
          region='test-region',
          cluster_name='test-cluster'))
  self.assertEqual(
      manager.get_staging_location(),
      'gs://test-bucket/google-cloud-dataproc-metainfo/')
def test_cleanup_permission_denied(self, mock_cluster_client, mock_cleanup):
  """
  Verifies that an exception is raised when the user has insufficient
  permissions to delete the project's cluster.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
    self.assertRaises(ValueError, manager.cleanup)
    self.assertTrue(
        'Due to insufficient project permissions' in
        context_manager.output[0])
def test_reuse_a_cluster_for_a_known_pipeline(self):
  """A pipeline already bound to a cluster keeps using that cluster."""
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  pipeline = beam.Pipeline(runner=runner, options=options)
  metadata = ClusterMetadata(project_id='test-project', region='test-region')
  # Bind the pipeline to a cluster manager so the pipeline is known.
  clusters.pipelines[pipeline] = DataprocClusterManager(metadata)

  runner.configure_for_flink(pipeline, options)

  # The known cluster is reused for the pipeline.
  self.assertIs(clusters.cluster_metadata(pipeline), metadata)
def test_describe_everything_when_cluster_identifer_unknown(self):
  """Verifies that describe() falls back to listing metadata of all known
  clusters when the given identifier (pipeline, master_url, or metadata)
  is unknown."""
  known_meta = ClusterMetadata(project_id='test-project')
  known_meta2 = ClusterMetadata(
      project_id='test-project', region='some-other-region')
  dcm = self.clusters.create(known_meta)
  dcm2 = self.clusters.create(known_meta2)
  p = beam.Pipeline()
  p2 = beam.Pipeline()
  self.clusters.pipelines[p] = dcm
  dcm.pipelines.add(p)
  self.clusters.pipelines[p2] = dcm2
  # Fix: p2 is managed by dcm2, so register it with dcm2 (was wrongly
  # added to dcm's pipeline set).
  dcm2.pipelines.add(p2)
  # An unknown pipeline.
  cid_pipeline = beam.Pipeline()
  meta_list = self.clusters.describe(cid_pipeline)
  self.assertEqual([known_meta, known_meta2], meta_list)
  # An unknown master_url.
  cid_master_url = 'some-random-url'
  meta_list = self.clusters.describe(cid_master_url)
  self.assertEqual([known_meta, known_meta2], meta_list)
  # An unknown cluster metadata.
  cid_meta = ClusterMetadata(project_id='some-random-project')
  meta_list = self.clusters.describe(cid_meta)
  self.assertEqual([known_meta, known_meta2], meta_list)
def test_parse_master_url_and_dashboard(self, mock_cluster_details):
  """
  Verifies that parse_master_url_and_dashboard properly parses the input
  string and produces a mock master_url and mock dashboard link.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  line = 'test-line Found Web Interface test-master-url' \
      ' of application \'test-app-id\'.\n'

  master_url, dashboard = manager.parse_master_url_and_dashboard(line)

  self.assertEqual('test-master-url', master_url)
  self.assertEqual(
      'test-resource-manager/gateway/default/yarn/proxy/test-app-id/',
      dashboard)
def test_cleanup_noop_unknown_cluster(self):
  """Cleanup with unknown identifiers deletes nothing and changes nothing."""
  metadata = ClusterMetadata(project_id='test-project')
  manager = self.clusters.create(metadata)
  pipeline = beam.Pipeline()
  self.clusters.pipelines[pipeline] = manager
  manager.pipelines.add(pipeline)

  # None of these identifiers matches the known cluster.
  self.clusters.cleanup(beam.Pipeline())
  self.m_delete_cluster.assert_not_called()
  self.clusters.cleanup('some-random-url')
  self.m_delete_cluster.assert_not_called()
  self.clusters.cleanup(ClusterMetadata(project_id='random-project'))
  self.m_delete_cluster.assert_not_called()

  # All original state is intact.
  self.assertIn(metadata, self.clusters.dataproc_cluster_managers)
  self.assertIn(metadata.master_url, self.clusters.master_urls)
  self.assertIs(metadata, self.clusters.default_cluster_metadata)
  self.assertIn(pipeline, self.clusters.pipelines)
  self.assertIn(pipeline, manager.pipelines)
def test_get_cluster_details_permission_denied(self, mock_cluster_client):
  """
  Verifies that an exception is raised when the user has insufficient
  permissions to get information about the project.
  """
  manager = DataprocClusterManager(
      ClusterMetadata(project_id='test-project', region='test-region'))
  from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
  with self.assertLogs(
      _LOGGER,
      level='ERROR') as context_manager, self.assertRaises(ValueError):
    manager.get_cluster_details()
    self.assertTrue(
        'Due to insufficient project permissions' in
        context_manager.output[0])
def test_list_clusters(self):
  """The inspector lists each cluster keyed by its obfuscated metadata."""
  metadata = ClusterMetadata(project_id='project')
  manager = self.current_env.clusters.create(metadata)
  pipeline = beam.Pipeline()
  manager.pipelines.add(pipeline)
  self.current_env.clusters.pipelines[pipeline] = manager

  expected = {
      obfuscate(metadata): {
          'cluster_name': metadata.cluster_name,
          'project': metadata.project_id,
          'region': metadata.region,
          'master_url': metadata.master_url,
          'dashboard': metadata.dashboard,
          'pipelines': [str(id(p)) for p in manager.pipelines]
      }
  }
  self.assertEqual(
      expected, json.loads(self.current_env.inspector.list_clusters()))