Example #1
    def test_set_default_cluster(self):
        clusters = ie.current_env().clusters
        master_url = 'test-url'
        cluster_name = 'test-cluster'
        project = 'test-project'
        region = 'test-region'
        pipelines = ['pid']
        dashboard = 'test-dashboard'

        cluster_id = obfuscate(project, region, cluster_name)
        ie.current_env().inspector._clusters = {
            cluster_id: {
                'cluster_name': cluster_name,
                'project': project,
                'region': region,
                'master_url': master_url,
                'dashboard': dashboard,
                'pipelines': pipelines
            }
        }
        clusters.master_urls[master_url] = MasterURLIdentifier(
            project, region, cluster_name)
        clusters.set_default_cluster(
            ie.current_env().inspector.get_cluster_master_url(cluster_id))
        self.assertEqual(MasterURLIdentifier(project, region, cluster_name),
                         clusters.default_cluster_metadata)
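The cluster_id above comes from obfuscate(project, region, cluster_name). As a minimal sketch of such an id helper, assuming it simply hashes the joined string forms of its inputs (the hypothetical obfuscate_sketch below stands in for apache_beam.runners.interactive.utils.obfuscate):

import hashlib

def obfuscate_sketch(*inputs):
    # Hypothetical stand-in for obfuscate: join the string forms of the
    # inputs and hash them, so the same (project, region, cluster_name)
    # triple always yields the same cluster id.
    merged = '_'.join(str(value) for value in inputs)
    return hashlib.md5(merged.encode('utf-8')).hexdigest()

# Usage mirroring the test above.
cluster_id = obfuscate_sketch('test-project', 'test-region', 'test-cluster')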
Example #2
 def test_clusters_cleanup_skip_on_duplicate(self, mock_master_url):
     clusters = ib.Clusters()
     project = 'test-project'
     region = 'test-region'
     p1 = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     p2 = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     cluster_metadata_1 = MasterURLIdentifier(project_id=project,
                                              region=region)
     clusters.dataproc_cluster_managers[str(
         id(p1))] = DataprocClusterManager(cluster_metadata_1)
     clusters.dataproc_cluster_managers[str(id(p1))].master_url = 'test_url'
     clusters.master_urls_to_pipelines['test_url'].append(str(id(p1)))
     cluster_metadata_2 = MasterURLIdentifier(project_id=project,
                                              region=region)
     clusters.dataproc_cluster_managers[str(
         id(p2))] = DataprocClusterManager(cluster_metadata_2)
     clusters.dataproc_cluster_managers[str(id(p2))].master_url = 'test_url'
     clusters.master_urls_to_pipelines['test_url'].append(str(id(p2)))
     from apache_beam.runners.interactive.interactive_beam import _LOGGER
     with self.assertLogs(_LOGGER, level='WARNING') as context_manager:
         clusters.cleanup(p1)
         self.assertTrue('skipping deletion' in context_manager.output[0])
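What this test checks: Clusters.cleanup should skip deleting a Dataproc cluster while another pipeline still points at the same master_url. A minimal sketch of that guard, with names and structure assumed rather than taken from the real interactive_beam implementation:

import logging

_LOGGER = logging.getLogger(__name__)

def cleanup_sketch(clusters, pipeline):
    # Hypothetical guard: only delete the cluster when no other pipeline
    # still maps to the same master_url; otherwise warn and skip.
    pipeline_id = str(id(pipeline))
    manager = clusters.dataproc_cluster_managers.get(pipeline_id)
    if manager is None:
        return
    sharers = clusters.master_urls_to_pipelines.get(manager.master_url, [])
    if len(sharers) > 1:
        _LOGGER.warning(
            'Cluster is still in use, skipping deletion of %s',
            manager.master_url)
        sharers.remove(pipeline_id)
    else:
        manager.cleanup()
    del clusters.dataproc_cluster_managers[pipeline_id]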
Example #3
    def test_delete_cluster(self):
        clusters = ie.current_env().clusters

        class MockClusterManager:
            master_url = 'test-url'

            def cleanup(self):
                pass

        master_url = 'test-url'
        cluster_name = 'test-cluster'
        project = 'test-project'
        region = 'test-region'
        metadata = MasterURLIdentifier(project, region, cluster_name)

        p = beam.Pipeline(ir.InteractiveRunner())
        ie.current_env()._tracked_user_pipelines.add_user_pipeline(p)
        clusters.master_urls[master_url] = metadata
        clusters.master_urls_to_dashboards[master_url] = 'test-dashboard'
        clusters.dataproc_cluster_managers[str(id(p))] = MockClusterManager()
        clusters.master_urls_to_pipelines[master_url] = [str(id(p))]

        cluster_id = obfuscate(project, region, cluster_name)
        ie.current_env().inspector._clusters[cluster_id] = {
            'master_url': master_url,
            'pipelines': [str(id(p))]
        }
        clusters.delete_cluster(
            ie.current_env().inspector.get_cluster_master_url(cluster_id))
        self.assertEqual(clusters.master_urls, {})
        self.assertEqual(clusters.master_urls_to_pipelines, {})
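The two assertions pin down the bookkeeping delete_cluster is expected to do. A sketch of that teardown under the same assumptions (a hypothetical helper, not the real Clusters.delete_cluster):

def delete_cluster_sketch(clusters, master_url):
    # Hypothetical teardown: clean up every pipeline bound to this
    # master_url, then drop all mappings that reference it.
    for pipeline_id in clusters.master_urls_to_pipelines.pop(master_url, []):
        manager = clusters.dataproc_cluster_managers.pop(pipeline_id, None)
        if manager is not None:
            manager.cleanup()
    clusters.master_urls.pop(master_url, None)
    clusters.master_urls_to_dashboards.pop(master_url, None)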
Example #4
 def test_list_clusters(self):
     master_url = 'test-url'
     cluster_name = 'test-cluster'
     project = 'test-project'
     region = 'test-region'
     pipelines = ['pid']
     dashboard = 'test-dashboard'
     ie.current_env(
     ).clusters.master_urls[master_url] = MasterURLIdentifier(
         project, region, cluster_name)
     ie.current_env(
     ).clusters.master_urls_to_pipelines[master_url] = pipelines
     ie.current_env(
     ).clusters.master_urls_to_dashboards[master_url] = dashboard
     ins = inspector.InteractiveEnvironmentInspector()
     cluster_id = obfuscate(project, region, cluster_name)
     self.assertEqual(
         {
             cluster_id: {
                 'cluster_name': cluster_name,
                 'project': project,
                 'region': region,
                 'master_url': master_url,
                 'dashboard': dashboard,
                 'pipelines': pipelines
             }
         }, json.loads(ins.list_clusters()))
Example #5
 def test_get_staging_location_exception(self, mock_cluster_client):
     """
     Tests that an error raised inside get_staging_location is surfaced.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region',
                                            cluster_name='test-cluster')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     with self.assertRaises(MockException):
         cluster_manager.get_staging_location(cluster_metadata)
Example #6
 def test_get_staging_location(self, mock_cluster_client, mock_list):
     """
     Tests that get_staging_location successfully returns a mocked
     staging location.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region',
                                            cluster_name='test-cluster')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     self.assertEqual(
         cluster_manager.get_staging_location(cluster_metadata),
         'gs://test-bucket/google-cloud-dataproc-metainfo/')
Example #7
 def test_create_cluster_default_already_exists(self, mock_cluster_client):
     """
     Tests that no exception is thrown when a cluster already exists
     but uses ie.current_env().clusters.default_cluster_name.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
     with self.assertLogs(_LOGGER, level='INFO') as context_manager:
         cluster_manager.create_cluster({})
         self.assertTrue('already exists' in context_manager.output[0])
Example #8
 def test_create_cluster_permission_denied(self, mock_cluster_client):
     """
     Tests that an exception is thrown when a user tries to write to
     a project without sufficient permissions.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
     with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
         self.assertRaises(ValueError, cluster_manager.create_cluster, {})
         self.assertTrue('Due to insufficient project permissions' in
                         context_manager.output[0])
Example #9
 def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
     """
     Tests that an exception is thrown when a user specifies a region
     that does not exist.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
     with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
         self.assertRaises(ValueError, cluster_manager.create_cluster, {})
         self.assertTrue(
             'Invalid region provided' in context_manager.output[0])
Example #10
 def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
     """
     Tests that an exception is thrown when cleanup attempts to delete
     a cluster that does not exist.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
     with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
         self.assertRaises(ValueError, cluster_manager.cleanup)
         self.assertTrue(
             'Cluster does not exist' in context_manager.output[0])
Example #11
 def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
     """
     Tests that an exception is thrown when cleanup hits an error that
     no other case handles.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
     with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
         self.assertRaises(MockException, cluster_manager.cleanup)
         self.assertTrue(
             'Failed to delete cluster' in context_manager.output[0])
Example #12
 def test_get_master_url_and_dashboard(self, mock_parse_method):
     """
     Tests that get_master_url_and_dashboard detects the line containing
     the unique substring which identifies the location of the master_url
     and application id of the Flink master.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     cluster_manager._fs = MockFileSystem()
     master_url, dashboard = cluster_manager.get_master_url_and_dashboard(
         cluster_metadata, 'test-staging-bucket')
     self.assertEqual(master_url, 'test-master-url')
     self.assertEqual(dashboard, 'test-dashboard-link')
Example #13
 def test_clusters_cleanup_otherwise(self, mock_cleanup):
     clusters = ie.current_env().clusters
     project = 'test-project'
     region = 'test-region'
     p = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     cluster_metadata = MasterURLIdentifier(project_id=project,
                                            region=region)
     clusters.dataproc_cluster_managers[str(
         id(p))] = DataprocClusterManager(cluster_metadata)
     clusters.dataproc_cluster_managers[str(id(p))].master_url = 'test_url'
     clusters.cleanup(p)
Example #14
 def test_get_cluster_details_permission_denied(self, mock_cluster_client):
     """
     Tests that an exception is thrown when a user tries to get information
     about a project without sufficient permissions.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
     with self.assertLogs(
             _LOGGER, level='ERROR') as context_manager, self.assertRaises(
                 ValueError):
         cluster_manager.get_cluster_details(cluster_metadata)
         self.assertTrue('Due to insufficient project permissions' in
                         context_manager.output[0])
Example #15
 def test_cleanup_all_dataproc_clusters(self, mock_cleanup):
     env = ie.InteractiveEnvironment()
     project = 'test-project'
     region = 'test-region'
     p = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     cluster_metadata = MasterURLIdentifier(project_id=project,
                                            region=region)
     env.clusters.dataproc_cluster_managers[str(
         id(p))] = DataprocClusterManager(cluster_metadata)
     env._tracked_user_pipelines.add_user_pipeline(p)
     env.cleanup()
     self.assertEqual(env.clusters.dataproc_cluster_managers, {})
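The final assertion relies on InteractiveEnvironment.cleanup tearing down every tracked Dataproc cluster manager. A minimal sketch of that loop, with the structure assumed:

def cleanup_all_sketch(env):
    # Hypothetical teardown mirroring what the assertion checks: clean up
    # each Dataproc cluster manager, then clear the registry.
    for manager in env.clusters.dataproc_cluster_managers.values():
        manager.cleanup()
    env.clusters.dataproc_cluster_managers.clear()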
Example #16
 def test_clusters_describe(self):
     clusters = ib.Clusters()
     project = 'test-project'
     region = 'test-region'
     p = beam.Pipeline(options=PipelineOptions(
         project=project,
         region=region,
     ))
     cluster_metadata = MasterURLIdentifier(project_id=project,
                                            region=region)
     clusters.dataproc_cluster_managers[str(
         id(p))] = DataprocClusterManager(cluster_metadata)
     self.assertEqual(
         'test-project',
         clusters.describe()[str(id(p))]['cluster_metadata'].project_id)
Example #17
 def test_parse_master_url_and_dashboard(self, mock_cluster_details):
     """
     Tests that parse_master_url_and_dashboard properly parses the input
     string and produces a mock master_url and mock dashboard link.
     """
     cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                            region='test-region')
     cluster_manager = DataprocClusterManager(cluster_metadata)
     line = ('test-line Found Web Interface test-master-url'
             ' of application \'test-app-id\'.\n')
     master_url, dashboard = cluster_manager.parse_master_url_and_dashboard(
         cluster_metadata, line)
     self.assertEqual('test-master-url', master_url)
     self.assertEqual(
         'test-resource-manager/gateway/default/yarn/proxy/test-app-id/',
         dashboard)
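The expected values suggest how such a log line could be parsed. The regex-based sketch below is an assumed stand-in for DataprocClusterManager.parse_master_url_and_dashboard; the resource-manager prefix is hard-coded here only to match the mocked test values:

import re

def parse_line_sketch(line, resource_manager='test-resource-manager'):
    # Hypothetical parser: pull the master URL and YARN application id out
    # of the 'Found Web Interface ...' log line, then build the proxied
    # dashboard link from the resource manager address.
    match = re.search(r"Found Web Interface (\S+) of application '([^']+)'", line)
    if not match:
        return None, None
    master_url, app_id = match.groups()
    dashboard = '%s/gateway/default/yarn/proxy/%s/' % (resource_manager, app_id)
    return master_url, dashboard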
Example #18
    def test_get_master_url_no_flink_master_and_master_url_exists(self, m_env):
        clusters = ib.Clusters()
        m_env().clusters = clusters

        from apache_beam.runners.portability.flink_runner import FlinkRunner
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        p = beam.Pipeline(options=PipelineOptions(
            project='test-project',
            region='test-region',
        ))
        cluster_name = clusters.default_cluster_name
        cluster_metadata = MasterURLIdentifier(project_id='test-project',
                                               region='test-region',
                                               cluster_name=cluster_name)
        clusters.master_urls['test-url'] = cluster_metadata
        clusters.master_urls_to_dashboards['test-url'] = 'test-dashboard'
        flink_master = runner._get_dataproc_cluster_master_url_if_applicable(p)
        self.assertEqual(
            clusters.describe(p)['cluster_metadata'].project_id,
            'test-project')
        self.assertEqual(flink_master, clusters.describe(p)['master_url'])
Example #19
    def _get_dataproc_cluster_master_url_if_applicable(
            self, user_pipeline: beam.Pipeline) -> str:
        """ Creates a Dataproc cluster if the provided user_pipeline is running
    FlinkRunner and no flink_master was provided as an option. A cluster
    is not created when a flink_master is detected.

    Example pipeline options to enable automatic Dataproc cluster creation:
      options = PipelineOptions([
      '--runner=FlinkRunner',
      '--project=my-project',
      '--region=my-region',
      '--environment_type=DOCKER'
      ])

    Example pipeline options to skip automatic Dataproc cluster creation:
      options = PipelineOptions([
      '--runner=FlinkRunner',
      '--flink_master=example.internal:41979',
      '--environment_type=DOCKER'
      ])
    """
        from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager
        from apache_beam.runners.portability.flink_runner import FlinkRunner
        flink_master = user_pipeline.options.view_as(
            FlinkRunnerOptions).flink_master
        clusters = ie.current_env().clusters
        # Only apply this logic when both of the two conditions below hold.
        if isinstance(self._underlying_runner,
                      FlinkRunner) and clusters.dataproc_cluster_managers.get(
                          str(id(user_pipeline)), None) is None:
            if flink_master == '[auto]':
                # The above condition is True when the user has not provided a
                # flink_master.
                if ie.current_env()._is_in_ipython:
                    warnings.filterwarnings(
                        'ignore',
                        'options is deprecated since First stable release. References to '
                        '<pipeline>.options will not be supported',
                        category=DeprecationWarning)
                project_id = (
                    user_pipeline.options.view_as(GoogleCloudOptions).project)
                region = (
                    user_pipeline.options.view_as(GoogleCloudOptions).region)
                if not project_id:
                    # When a Google Cloud project is not specified, we try to set the
                    # cluster_metadata to be the default value set from the
                    # 'Manage Clusters' JupyterLab extension. If a value has not been
                    # specified, this value defaults to None.
                    cluster_metadata = ie.current_env(
                    ).clusters.default_cluster_metadata
                else:
                    cluster_name = ie.current_env(
                    ).clusters.default_cluster_name
                    cluster_metadata = MasterURLIdentifier(
                        project_id=project_id,
                        region=region,
                        cluster_name=cluster_name)
            else:
                cluster_metadata = clusters.master_urls.get(flink_master, None)
            # else noop, no need to log anything because we allow a master_url
            # (not managed by us) provided by the user.
            if cluster_metadata:
                # create the cluster_manager and populate dicts in the clusters
                # instance if the pipeline is not already mapped to an existing
                # cluster_manager.
                cluster_manager = DataprocClusterManager(cluster_metadata)
                cluster_manager.create_flink_cluster()
                clusters.master_urls[
                    cluster_manager.master_url] = cluster_metadata
                clusters.dataproc_cluster_managers[str(
                    id(user_pipeline))] = cluster_manager
                clusters.master_urls_to_pipelines[
                    cluster_manager.master_url].append(str(id(user_pipeline)))
                clusters.master_urls_to_dashboards[
                    cluster_manager.master_url] = cluster_manager.dashboard
                return cluster_manager.master_url
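As a usage sketch of the docstring above: an interactive pipeline built like this (options taken from the docstring, with InteractiveRunner wrapping FlinkRunner as in Example #18) would take the cluster-creation branch, since flink_master defaults to '[auto]':

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.runners.portability.flink_runner import FlinkRunner

# Options from the docstring: FlinkRunner plus a Google Cloud project/region
# and no explicit --flink_master, so a Dataproc cluster would be provisioned.
options = PipelineOptions([
    '--runner=FlinkRunner',
    '--project=my-project',
    '--region=my-region',
    '--environment_type=DOCKER',
])
p = beam.Pipeline(
    InteractiveRunner(underlying_runner=FlinkRunner()), options=options)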