Example #1
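These snippets come from Apache Beam's interactive-runner Dataproc tests, and the extraction dropped the module preamble. Below is a minimal sketch of the imports and test doubles the snippets appear to assume; the exact import paths and mock shapes in the real test files may differ. The mock_* parameters on the tests indicate @patch decorators that were also stripped (a sketch of those follows the last test in this example).

# Hedged reconstruction of the assumed test-module preamble; the tests below
# are methods of a unittest.TestCase subclass.
import unittest
from unittest.mock import patch

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import (
    DataprocClusterManager, MasterURLIdentifier)


class MockException(Exception):
  """Stand-in raised by mocked Dataproc client calls.

  Assumption: it carries a status `code` so the cluster manager can branch
  on values such as 403 (permission denied) or 404 (not found).
  """
  def __init__(self, code=-1):
    super().__init__()
    self.code = code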
 def test_clusters_cleanup_skip_on_duplicate(self, mock_master_url):
     clusters = ib.Clusters()
     project = 'test-project'
     region = 'test-region'
     p1 = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     p2 = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     cluster_metadata_1 = MasterURLIdentifier(project_id=project,
                                              region=region)
     clusters.dataproc_cluster_managers[str(
         id(p1))] = DataprocClusterManager(cluster_metadata_1)
     clusters.dataproc_cluster_managers[str(id(p1))].master_url = 'test_url'
     clusters.master_urls_to_pipelines['test_url'].append(str(id(p1)))
     cluster_metadata_2 = MasterURLIdentifier(project_id=project,
                                              region=region)
     clusters.dataproc_cluster_managers[str(
         id(p2))] = DataprocClusterManager(cluster_metadata_2)
     clusters.dataproc_cluster_managers[str(id(p2))].master_url = 'test_url'
     clusters.master_urls_to_pipelines['test_url'].append(str(id(p2)))
     from apache_beam.runners.interactive.interactive_beam import _LOGGER
     with self.assertLogs(_LOGGER, level='WARNING') as context_manager:
         clusters.cleanup(p1)
         self.assertTrue('skipping deletion' in context_manager.output[0])
 def test_get_staging_location_exception(self, mock_cluster_client):
   """
   Tests that an error raised inside get_staging_location is propagated.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project',
       region='test-region',
       cluster_name='test-cluster')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   with self.assertRaises(MockException):
     cluster_manager.get_staging_location(cluster_metadata)
 def test_create_cluster_default_already_exists(self, mock_cluster_client):
   """
   Tests that no exception is thrown when a cluster already exists and the
   pipeline is using ie.current_env().clusters.default_cluster_name.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='INFO') as context_manager:
     cluster_manager.create_cluster({})
     self.assertTrue('already exists' in context_manager.output[0])
 def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
   """
   Tests that an exception is thrown when cleanup encounters an error that is
   not handled by any other case.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(MockException, cluster_manager.cleanup)
     self.assertTrue('Failed to delete cluster' in context_manager.output[0])
 def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
   """
   Tests that an exception is thrown when cleanup attempts to delete
   a cluster that does not exist.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.cleanup)
     self.assertTrue('Cluster does not exist' in context_manager.output[0])
 def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
   """
   Tests that an exception is thrown when a user specifies a region
   that does not exist.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.create_cluster, {})
     self.assertTrue('Invalid region provided' in context_manager.output[0])
 def test_get_staging_location(self, mock_cluster_client, mock_list):
   """
   Tests that get_staging_location successfully returns the mock staging
   location.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project',
       region='test-region',
       cluster_name='test-cluster')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   self.assertEqual(
       cluster_manager.get_staging_location(cluster_metadata),
       'gs://test-bucket/google-cloud-dataproc-metainfo/')
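Each mock_* parameter above corresponds to a @patch decorator that was stripped during extraction. A hedged sketch of the kind of decorator stacks these tests imply; the patch targets, helper names, and side-effect codes are assumptions and may differ from Beam's actual test suite:

# Hypothetical decorator stack: the Dataproc client call fails with an
# "already exists"-style code, and create_cluster logs instead of raising.
@patch(
    'google.cloud.dataproc_v1.ClusterControllerClient.create_cluster',
    side_effect=MockException(409))
def test_create_cluster_default_already_exists(self, mock_cluster_client):
  ...

# Two mock parameters imply two decorators; the bottom decorator maps to the
# first parameter. Here delete_cluster raises "not found" while the manager's
# staging-file cleanup helper (name assumed) is stubbed out.
@patch(
    'apache_beam.runners.interactive.dataproc.dataproc_cluster_manager.'
    'DataprocClusterManager.cleanup_staging_files')
@patch(
    'google.cloud.dataproc_v1.ClusterControllerClient.delete_cluster',
    side_effect=MockException(404))
def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
  ...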
Example #8
 def test_clusters_cleanup_otherwise(self, mock_cleanup):
     clusters = ie.current_env().clusters
     project = 'test-project'
     region = 'test-region'
     p = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     cluster_metadata = MasterURLIdentifier(project_id=project,
                                            region=region)
     clusters.dataproc_cluster_managers[str(
         id(p))] = DataprocClusterManager(cluster_metadata)
     clusters.dataproc_cluster_managers[str(id(p))].master_url = 'test_url'
     clusters.cleanup(p)
 def test_create_cluster_permission_denied(self, mock_cluster_client):
   """
   Tests that an exception is thrown when a user is trying to write to
   a project while having insufficient permissions.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.create_cluster, {})
     self.assertTrue(
         'Due to insufficient project permissions' in
         context_manager.output[0])
Example #10
 def test_clusters_describe(self):
   clusters = ib.Clusters()
   project = 'test-project'
   region = 'test-region'
   p = beam.Pipeline(
       options=PipelineOptions(
           project=project,
           region=region,
       ))
   cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
   clusters.dataproc_cluster_managers[p] = DataprocClusterManager(
       cluster_metadata)
   self.assertEqual(
       'test-project',
       clusters.describe()[None]['cluster_metadata'].project_id)
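A hedged reading of the [None] index above: the manager's master_url is never assigned in this test, so an all-clusters description keyed by master URL would file this entry under None. A small illustration of that assumption:

# Assumption: Clusters.describe() without a pipeline argument returns a
# mapping keyed by each manager's master_url, which is still None here.
descriptions = clusters.describe()
assert None in descriptions
assert descriptions[None]['cluster_metadata'].region == 'test-region'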
Example #11
 def test_cleanup_all_dataproc_clusters(self, mock_cleanup):
   env = ie.InteractiveEnvironment()
   project = 'test-project'
   region = 'test-region'
   p = beam.Pipeline(
       options=PipelineOptions(
           project=project,
           region=region,
       ))
   cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
   env.clusters.dataproc_cluster_managers[str(
       id(p))] = DataprocClusterManager(cluster_metadata)
   env._tracked_user_pipelines.add_user_pipeline(p)
   env.cleanup()
   self.assertEqual(env.clusters.dataproc_cluster_managers, {})
Example #12
 def test_get_master_url_and_dashboard(self, mock_parse_method):
   """
   Tests that get_master_url_and_dashboard detects the line containing the
   unique substring which identifies the location of the master_url and
   application id of the Flink master.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   cluster_manager._fs = MockFileSystem()
   master_url, dashboard = cluster_manager.get_master_url_and_dashboard(
       cluster_metadata,
       'test-staging-bucket'
   )
   self.assertEqual(master_url, 'test-master-url')
   self.assertEqual(dashboard, 'test-dashboard-link')
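MockFileSystem is defined elsewhere in the test module. Below is a minimal stand-in consistent with this test, assuming the manager matches a staging file and reads its lines; note the mock_parse_method parameter suggests parse_master_url_and_dashboard itself is patched to return the expected ('test-master-url', 'test-dashboard-link') pair:

# Hypothetical stand-in for the dropped MockFileSystem definition.
from types import SimpleNamespace


class MockFileIO:
  def __init__(self, contents):
    self.contents = contents

  def readlines(self):
    return [self.contents.encode('utf-8')]


class MockFileSystem:
  def match(self, *args, **kwargs):
    # Pretend exactly one staging file matched.
    return [SimpleNamespace(metadata_list=[SimpleNamespace(path='test-path')])]

  def open(self, *args, **kwargs):
    # A line containing the unique substring the manager scans for.
    return MockFileIO(
        "test-line Found Web Interface test-master-url"
        " of application 'test-app-id'.\n")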
Example #13
 def test_get_cluster_details_permission_denied(self, mock_cluster_client):
   """
   Tests that an exception is thrown when a user is trying to get information
   for a project without sufficient permissions to do so.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(
       _LOGGER,
       level='ERROR') as context_manager, self.assertRaises(ValueError):
     cluster_manager.get_cluster_details(cluster_metadata)
   # The call above raises, so assert on the captured logs outside the block;
   # inside it, the assertion would never execute.
   self.assertTrue(
       'Due to insufficient project permissions' in
       context_manager.output[0])
Example #14
 def test_parse_master_url_and_dashboard(self, mock_cluster_details):
   """
   Tests that parse_master_url_and_dashboard properly parses the input
   string and produces a mock master_url and mock dashboard link.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   line = (
       'test-line Found Web Interface test-master-url'
       ' of application \'test-app-id\'.\n')
   master_url, dashboard = cluster_manager.parse_master_url_and_dashboard(
       cluster_metadata, line)
   self.assertEqual('test-master-url', master_url)
   self.assertEqual(
       'test-resource-manager/gateway/default/yarn/proxy/test-app-id/',
       dashboard)
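The expected dashboard value suggests that the patched get_cluster_details (the mock_cluster_details parameter) returns cluster details whose YARN ResourceManager endpoint begins with 'test-resource-manager/'. A hedged sketch of such a decorator; the exact return shape used by Beam's tests is an assumption:

from types import SimpleNamespace

# Hypothetical patch; mirrors the Dataproc Cluster's
# config.endpoint_config.http_ports layout.
@patch(
    'apache_beam.runners.interactive.dataproc.dataproc_cluster_manager.'
    'DataprocClusterManager.get_cluster_details',
    return_value=SimpleNamespace(
        config=SimpleNamespace(
            endpoint_config=SimpleNamespace(
                http_ports={
                    'YARN ResourceManager': 'test-resource-manager/'
                }))))
def test_parse_master_url_and_dashboard(self, mock_cluster_details):
  ...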
 def test_get_master_url_no_flink_master_and_master_url_exists(self):
     from apache_beam.runners.portability.flink_runner import FlinkRunner
     runner = interactive_runner.InteractiveRunner(
         underlying_runner=FlinkRunner())
     p = beam.Pipeline(options=PipelineOptions(
         project='test-project',
         region='test-region',
     ))
     cluster_name = ie.current_env().clusters.default_cluster_name
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region',
                                            cluster_name=cluster_name)
     ie.current_env().clusters.master_urls['test-url'] = cluster_metadata
     ie.current_env(
     ).clusters.master_urls_to_dashboards['test-url'] = 'test-dashboard'
     flink_master = runner._get_dataproc_cluster_master_url_if_applicable(p)
     self.assertEqual(
         ie.current_env().clusters.describe(p)
         ['cluster_metadata'].project_id, 'test-project')
     self.assertEqual(flink_master,
                      ie.current_env().clusters.describe(p)['master_url'])
Example #16
    def _create_dataproc_cluster_if_applicable(self, user_pipeline):
        """ Creates a Dataproc cluster if the provided user_pipeline is running
    FlinkRunner and no flink_master_url was provided as an option. A cluster
    is not created when a flink_master_url is detected.

    Example pipeline options to enable automatic Dataproc cluster creation:
      options = PipelineOptions([
      '--runner=FlinkRunner',
      '--project=my-project',
      '--region=my-region',
      '--environment_type=DOCKER'
      ])

    Example pipeline options to skip automatic Dataproc cluster creation:
      options = PipelineOptions([
      '--runner=FlinkRunner',
      '--flink_master=example.internal:41979',
      '--environment_type=DOCKER'
      ])
    """
        from apache_beam.runners.portability.flink_runner import FlinkRunner
        from apache_beam.options.pipeline_options import FlinkRunnerOptions
        flink_master = user_pipeline.options.view_as(
            FlinkRunnerOptions).flink_master
        clusters = ie.current_env().clusters
        # Only proceed when both of the two conditions below apply.
        if isinstance(self._underlying_runner,
                      FlinkRunner) and clusters.dataproc_cluster_managers.get(
                          str(id(user_pipeline)), None) is None:
            if flink_master == '[auto]':
                # The above condition is True when the user has not provided a
                # flink_master.
                if ie.current_env()._is_in_ipython:
                    warnings.filterwarnings(
                        'ignore',
                        'options is deprecated since First stable release. References to '
                        '<pipeline>.options will not be supported',
                        category=DeprecationWarning)
                project_id = (
                    user_pipeline.options.view_as(GoogleCloudOptions).project)
                region = (
                    user_pipeline.options.view_as(GoogleCloudOptions).region)
                cluster_name = ie.current_env().clusters.default_cluster_name
                cluster_metadata = MasterURLIdentifier(
                    project_id=project_id,
                    region=region,
                    cluster_name=cluster_name)
            else:
                cluster_metadata = clusters.master_urls.inverse.get(
                    flink_master, None)
            # else noop, no need to log anything because we allow a master_url
            # (not managed by us) provided by the user.
            if cluster_metadata:
                # create the cluster_manager and populate dicts in the clusters
                # instance if the pipeline is not already mapped to an existing
                # cluster_manager.
                cluster_manager = DataprocClusterManager(cluster_metadata)
                cluster_manager.create_flink_cluster()
                clusters.master_urls[
                    cluster_manager.master_url] = cluster_metadata
                clusters.dataproc_cluster_managers[str(
                    id(user_pipeline))] = cluster_manager
                clusters.master_urls_to_pipelines[
                    cluster_manager.master_url].append(str(id(user_pipeline)))
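Taken together with the docstring above, a hedged end-to-end sketch of the notebook usage this helper enables; the options come from the docstring, while the wiring described in the comments is an assumption:

# Hypothetical usage based on the snippets above.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.portability.flink_runner import FlinkRunner

options = PipelineOptions([
    '--runner=FlinkRunner',
    '--project=my-project',
    '--region=my-region',
    '--environment_type=DOCKER',
])
p = beam.Pipeline(
    interactive_runner.InteractiveRunner(underlying_runner=FlinkRunner()),
    options=options)
# On run, the interactive runner is expected to call
# _create_dataproc_cluster_if_applicable(p) and point flink_master at the
# managed Dataproc cluster's master URL.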