def test_clusters_cleanup_skip_on_duplicate(self, mock_master_url):
    """Checks that cleanup skips deletion when a master URL is shared.

    Two pipelines are each given their own cluster manager pointing at the
    same master URL ('test_url'). Cleaning up the first pipeline is then
    expected to emit a WARNING containing 'skipping deletion'.
    """
    clusters = ib.Clusters()
    project = 'test-project'
    region = 'test-region'
    p1 = beam.Pipeline(
        options=PipelineOptions(
            project=project,
            region=region,
        ))
    p2 = beam.Pipeline(
        options=PipelineOptions(
            project=project,
            region=region,
        ))
    # Register every pipeline under its *own* id. Previously the second
    # manager was stored under p1's id again, which overwrote the first
    # manager and left p2 without a manager even though p2 was listed in
    # master_urls_to_pipelines.
    for pipeline in (p1, p2):
        pipeline_id = str(id(pipeline))
        cluster_metadata = MasterURLIdentifier(
            project_id=project, region=region)
        clusters.dataproc_cluster_managers[
            pipeline_id] = DataprocClusterManager(cluster_metadata)
        clusters.dataproc_cluster_managers[
            pipeline_id].master_url = 'test_url'
        clusters.master_urls_to_pipelines['test_url'].append(pipeline_id)
    from apache_beam.runners.interactive.interactive_beam import _LOGGER
    with self.assertLogs(_LOGGER, level='WARNING') as context_manager:
        clusters.cleanup(p1)
    # assertIn shows the captured message on failure, unlike assertTrue.
    self.assertIn('skipping deletion', context_manager.output[0])
def test_get_staging_location_exception(self, mock_cluster_client):
    """
    Verifies that a MockException raised while resolving the staging
    location propagates out of get_staging_location.
    """
    metadata = MasterURLIdentifier(
        project_id='test-project',
        region='test-region',
        cluster_name='test-cluster')
    manager = DataprocClusterManager(metadata)
    # Callable form of assertRaises: the call is made by the assertion.
    self.assertRaises(
        MockException, manager.get_staging_location, metadata)
def test_create_cluster_default_already_exists(self, mock_cluster_client):
    """
    Tests that no exception is thrown when a cluster already exists, but is
    using ie.current_env().clusters.default_cluster_name.

    create_cluster is expected to log an INFO-or-higher message containing
    'already exists' instead of raising.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(_LOGGER, level='INFO') as context_manager:
        cluster_manager.create_cluster({})
    # assertIn prints the captured log line when the check fails, whereas
    # assertTrue(... in ...) only reports "False is not true".
    self.assertIn('already exists', context_manager.output[0])
def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
    """
    Tests that an exception is thrown when the exception is not handled by
    any other case under cleanup.

    cleanup is expected to raise MockException and to log an ERROR message
    containing 'Failed to delete cluster'.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
        self.assertRaises(MockException, cluster_manager.cleanup)
    # assertIn prints the captured log line when the check fails, whereas
    # assertTrue(... in ...) only reports "False is not true".
    self.assertIn('Failed to delete cluster', context_manager.output[0])
def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
    """
    Tests that an exception is thrown when cleanup attempts to delete a
    cluster that does not exist.

    cleanup is expected to raise ValueError and to log an ERROR message
    containing 'Cluster does not exist'.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
        self.assertRaises(ValueError, cluster_manager.cleanup)
    # assertIn prints the captured log line when the check fails, whereas
    # assertTrue(... in ...) only reports "False is not true".
    self.assertIn('Cluster does not exist', context_manager.output[0])
def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
    """
    Tests that an exception is thrown when a user specifies a region that
    does not exist.

    create_cluster is expected to raise ValueError and to log an ERROR
    message containing 'Invalid region provided'.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
        self.assertRaises(ValueError, cluster_manager.create_cluster, {})
    # assertIn prints the captured log line when the check fails, whereas
    # assertTrue(... in ...) only reports "False is not true".
    self.assertIn('Invalid region provided', context_manager.output[0])
def test_get_staging_location(self, mock_cluster_client, mock_list):
    """
    Verifies that get_staging_location returns the expected mock staging
    location when the lookup succeeds.
    """
    metadata = MasterURLIdentifier(
        project_id='test-project',
        region='test-region',
        cluster_name='test-cluster')
    manager = DataprocClusterManager(metadata)
    staging_location = manager.get_staging_location(metadata)
    self.assertEqual(
        staging_location,
        'gs://test-bucket/google-cloud-dataproc-metainfo/')
def test_clusters_cleanup_otherwise(self, mock_cleanup):
    """
    Runs Clusters.cleanup for a pipeline that has a registered cluster
    manager. There is no explicit assertion: the test passes as long as
    cleanup does not raise.
    """
    clusters = ie.current_env().clusters
    project, region = 'test-project', 'test-region'
    pipeline_options = PipelineOptions(project=project, region=region)
    p = beam.Pipeline(options=pipeline_options)
    pipeline_id = str(id(p))
    metadata = MasterURLIdentifier(project_id=project, region=region)
    clusters.dataproc_cluster_managers[pipeline_id] = (
        DataprocClusterManager(metadata))
    clusters.dataproc_cluster_managers[pipeline_id].master_url = 'test_url'
    clusters.cleanup(p)
def test_create_cluster_permission_denied(self, mock_cluster_client):
    """
    Tests that an exception is thrown when a user is trying to write to a
    project while having insufficient permissions.

    create_cluster is expected to raise ValueError and to log an ERROR
    message containing 'Due to insufficient project permissions'.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
        self.assertRaises(ValueError, cluster_manager.create_cluster, {})
    # assertIn prints the captured log line when the check fails, whereas
    # assertTrue(... in ...) only reports "False is not true".
    self.assertIn(
        'Due to insufficient project permissions',
        context_manager.output[0])
def test_clusters_describe(self):
    """
    Verifies that Clusters.describe() exposes the cluster metadata of a
    registered manager, by checking the metadata's project_id.
    """
    clusters = ib.Clusters()
    project, region = 'test-project', 'test-region'
    pipeline_options = PipelineOptions(project=project, region=region)
    p = beam.Pipeline(options=pipeline_options)
    metadata = MasterURLIdentifier(project_id=project, region=region)
    # The manager is keyed by the pipeline object itself here, and the
    # description is then read back under the key None.
    # NOTE(review): presumably describe() maps ids of untracked pipelines
    # to None -- confirm against Clusters.describe.
    clusters.dataproc_cluster_managers[p] = DataprocClusterManager(metadata)
    description = clusters.describe()[None]
    self.assertEqual(
        'test-project', description['cluster_metadata'].project_id)
def test_cleanup_all_dataproc_clusters(self, mock_cleanup):
    """
    Verifies that InteractiveEnvironment.cleanup() leaves no entries in
    clusters.dataproc_cluster_managers for a tracked user pipeline.
    """
    env = ie.InteractiveEnvironment()
    project, region = 'test-project', 'test-region'
    pipeline_options = PipelineOptions(project=project, region=region)
    p = beam.Pipeline(options=pipeline_options)
    metadata = MasterURLIdentifier(project_id=project, region=region)
    manager = DataprocClusterManager(metadata)
    env.clusters.dataproc_cluster_managers[str(id(p))] = manager
    env._tracked_user_pipelines.add_user_pipeline(p)
    env.cleanup()
    # Re-read the mapping from env after cleanup rather than reusing an
    # earlier reference.
    self.assertEqual(env.clusters.dataproc_cluster_managers, {})
def test_get_master_url_and_dashboard(self, mock_parse_method):
    """
    Verifies that get_master_url_and_dashboard returns the expected master
    url and dashboard link when reading through a MockFileSystem.

    NOTE(review): the returned values presumably come from the patched
    parse method (mock_parse_method) -- confirm against the decorator.
    """
    metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    manager = DataprocClusterManager(metadata)
    # Swap in the mock filesystem so no real storage is touched.
    manager._fs = MockFileSystem()
    url_and_dashboard = manager.get_master_url_and_dashboard(
        metadata, 'test-staging-bucket')
    master_url, dashboard = url_and_dashboard
    self.assertEqual(master_url, 'test-master-url')
    self.assertEqual(dashboard, 'test-dashboard-link')
def test_get_cluster_details_permission_denied(self, mock_cluster_client):
    """
    Tests that an exception is thrown when a user is trying to get
    information for a project without sufficient permissions to do so.

    get_cluster_details is expected to raise ValueError and to log an ERROR
    message containing 'Due to insufficient project permissions'.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(
            _LOGGER,
            level='ERROR') as context_manager, self.assertRaises(ValueError):
        cluster_manager.get_cluster_details(cluster_metadata)
    # The log check must live outside the with block: the call above raises,
    # so any statement following it inside the block would never execute.
    self.assertIn(
        'Due to insufficient project permissions',
        context_manager.output[0])
def test_parse_master_url_and_dashboard(self, mock_cluster_details):
    """
    Verifies that parse_master_url_and_dashboard extracts the master url
    from a log line and builds the expected dashboard link from the
    application id found in that line.
    """
    metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    manager = DataprocClusterManager(metadata)
    log_line = (
        "test-line Found Web Interface test-master-url"
        " of application 'test-app-id'.\n")
    expected_dashboard = (
        'test-resource-manager/gateway/default/yarn/proxy/test-app-id/')
    master_url, dashboard = manager.parse_master_url_and_dashboard(
        metadata, log_line)
    self.assertEqual('test-master-url', master_url)
    self.assertEqual(expected_dashboard, dashboard)
def test_get_master_url_no_flink_master_and_master_url_exists(self):
    """
    Verifies that, for a FlinkRunner-backed InteractiveRunner, a master url
    already registered for the default cluster metadata is the one returned
    by _get_dataproc_cluster_master_url_if_applicable, and that the
    pipeline's description reflects the same metadata and master url.
    """
    from apache_beam.runners.portability.flink_runner import FlinkRunner
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    pipeline_options = PipelineOptions(
        project='test-project',
        region='test-region',
    )
    p = beam.Pipeline(options=pipeline_options)
    default_name = ie.current_env().clusters.default_cluster_name
    metadata = MasterURLIdentifier(
        project_id='test-project',
        region='test-region',
        cluster_name=default_name)
    # Pre-register a master url and dashboard for this metadata.
    ie.current_env().clusters.master_urls['test-url'] = metadata
    ie.current_env().clusters.master_urls_to_dashboards['test-url'] = (
        'test-dashboard')
    flink_master = runner._get_dataproc_cluster_master_url_if_applicable(p)
    described_metadata = ie.current_env().clusters.describe(p)[
        'cluster_metadata']
    self.assertEqual(described_metadata.project_id, 'test-project')
    described_url = ie.current_env().clusters.describe(p)['master_url']
    self.assertEqual(flink_master, described_url)
def _create_dataproc_cluster_if_applicable(self, user_pipeline):
    """Creates a Dataproc-managed Flink cluster for user_pipeline if needed.

    Nothing happens unless the underlying runner is a FlinkRunner and the
    pipeline has no cluster manager registered yet. When both hold:

    * if flink_master is '[auto]' (the user did not provide one), cluster
      metadata is built from the pipeline's project and region together
      with the environment's default cluster name;
    * otherwise the metadata is looked up from the known master urls, and
      no cluster is created when the url is not one tracked here.

    Example pipeline options to enable automatic Dataproc cluster creation:
      options = PipelineOptions([
        '--runner=FlinkRunner',
        '--project=my-project',
        '--region=my-region',
        '--environment_type=DOCKER'
      ])

    Example pipeline options to skip automatic Dataproc cluster creation:
      options = PipelineOptions([
        '--runner=FlinkRunner',
        '--flink_master=example.internal:41979',
        '--environment_type=DOCKER'
      ])
    """
    from apache_beam.runners.portability.flink_runner import FlinkRunner
    from apache_beam.options.pipeline_options import FlinkRunnerOptions
    flink_options = user_pipeline.options.view_as(FlinkRunnerOptions)
    flink_master = flink_options.flink_master
    clusters = ie.current_env().clusters

    # Guard clauses: both conditions must hold for any work to be done.
    if not isinstance(self._underlying_runner, FlinkRunner):
        return
    pipeline_key = str(id(user_pipeline))
    if clusters.dataproc_cluster_managers.get(
            pipeline_key, None) is not None:
        return

    if flink_master != '[auto]':
        # A flink_master was supplied; it may or may not be one we manage.
        cluster_metadata = clusters.master_urls.inverse.get(
            flink_master, None)
    else:
        # No flink_master was provided by the user.
        if ie.current_env()._is_in_ipython:
            warnings.filterwarnings(
                'ignore',
                'options is deprecated since First stable release. '
                'References to '
                '<pipeline>.options will not be supported',
                category=DeprecationWarning)
        cluster_metadata = MasterURLIdentifier(
            project_id=user_pipeline.options.view_as(
                GoogleCloudOptions).project,
            region=user_pipeline.options.view_as(GoogleCloudOptions).region,
            cluster_name=ie.current_env().clusters.default_cluster_name)

    # No metadata means a user-provided master_url that is not managed
    # here; that is allowed, so there is nothing to do or log.
    if not cluster_metadata:
        return

    # Create the manager and record it in the clusters bookkeeping dicts.
    manager = DataprocClusterManager(cluster_metadata)
    manager.create_flink_cluster()
    master_url = manager.master_url
    clusters.master_urls[master_url] = cluster_metadata
    clusters.dataproc_cluster_managers[pipeline_key] = manager
    clusters.master_urls_to_pipelines[master_url].append(pipeline_key)