Example 1
    def test_reuse_default_cluster_if_not_configured(self):
        clusters = self.current_env.clusters
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions()
        # Pipeline is not configured to run on Cloud.
        p = beam.Pipeline(runner=runner, options=options)
        meta = ClusterMetadata(project_id='test-project', region='test-region')
        meta.master_url = 'test-url'
        meta.dashboard = 'test-dashboard'
        dcm = DataprocClusterManager(meta)
        # Configure the clusters so that a default cluster is known.
        clusters.dataproc_cluster_managers[meta] = dcm
        clusters.set_default_cluster(meta)
        runner.configure_for_flink(p, options)

        # The default cluster is used.
        tuned_meta = clusters.cluster_metadata(p)
        self.assertIs(tuned_meta, clusters.default_cluster_metadata)
        # The pipeline is known.
        self.assertIn(p, clusters.pipelines)
        registered_dcm = clusters.pipelines[p]
        self.assertIn(p, registered_dcm.pipelines)
        # The pipeline options are tuned for execution on the cluster.
        flink_options = options.view_as(FlinkRunnerOptions)
        self.assertEqual(flink_options.flink_master, tuned_meta.master_url)
        self.assertEqual(flink_options.flink_version,
                         clusters.DATAPROC_FLINK_VERSION)
Example 2
    def test_describe_by_cluster_identifier(self):
        known_meta = ClusterMetadata(project_id='test-project')
        known_meta2 = ClusterMetadata(project_id='test-project',
                                      region='some-other-region')
        dcm = self.clusters.create(known_meta)
        dcm2 = self.clusters.create(known_meta2)
        p = beam.Pipeline()
        p2 = beam.Pipeline()
        self.clusters.pipelines[p] = dcm
        dcm.pipelines.add(p)
        self.clusters.pipelines[p2] = dcm2
        dcm2.pipelines.add(p2)

        cid_pipeline = p
        meta = self.clusters.describe(cid_pipeline)
        self.assertIs(meta, known_meta)

        cid_master_url = known_meta.master_url
        meta = self.clusters.describe(cid_master_url)
        self.assertIs(meta, known_meta)

        cid_meta = ClusterMetadata(project_id=known_meta.project_id,
                                   region=known_meta.region,
                                   cluster_name=known_meta.cluster_name)
        meta = self.clusters.describe(cid_meta)
        self.assertIs(meta, known_meta)
Example 3
    def test_describe_everything(self):
        meta = ClusterMetadata(project_id='test-project')
        meta2 = ClusterMetadata(project_id='test-project',
                                region='some-other-region')
        _ = self.clusters.create(meta)
        _ = self.clusters.create(meta2)

        meta_list = self.clusters.describe()
        self.assertEqual([meta, meta2], meta_list)
Example 4
 def _worker_options_to_cluster_metadata(self, options: PipelineOptions,
                                         cluster_metadata: ClusterMetadata):
     worker_options = options.view_as(WorkerOptions)
     if worker_options.subnetwork:
         cluster_metadata.subnetwork = worker_options.subnetwork
     if worker_options.num_workers:
         cluster_metadata.num_workers = worker_options.num_workers
     if worker_options.machine_type:
         cluster_metadata.machine_type = worker_options.machine_type
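
A minimal usage sketch for the helper above, assuming `runner` is an InteractiveRunner as in Example 1 and the import paths of recent Beam versions; the subnetwork value is hypothetical:

    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.runners.interactive import interactive_runner
    from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
    from apache_beam.runners.portability.flink_runner import FlinkRunner

    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    # Worker-related flags that _worker_options_to_cluster_metadata reads.
    options = PipelineOptions([
        '--num_workers=3',
        '--machine_type=n1-standard-4',
        '--subnetwork=my-subnet',  # hypothetical subnetwork name
    ])
    meta = ClusterMetadata(project_id='test-project', region='test-region')
    runner._worker_options_to_cluster_metadata(options, meta)
    # Each recognized worker option is copied onto the cluster metadata.
    assert meta.num_workers == 3
    assert meta.machine_type == 'n1-standard-4'
    assert meta.subnetwork == 'my-subnet'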
Example 5
    def test_force_cleanup_everything(self):
        meta = ClusterMetadata(project_id='test-project')
        meta2 = ClusterMetadata(project_id='test-project-2')
        _ = self.clusters.create(meta)
        _ = self.clusters.create(meta2)

        self.clusters.cleanup(force=True)
        self.assertEqual(self.m_delete_cluster.call_count, 2)
        self.assertNotIn(meta, self.clusters.dataproc_cluster_managers)
        self.assertNotIn(meta2, self.clusters.dataproc_cluster_managers)
        self.assertIsNone(self.clusters.default_cluster_metadata)
Example 6
    def test_cleanup_by_meta(self):
        known_meta = ClusterMetadata(project_id='test-project',
                                     region='test-region')
        _ = self.clusters.create(known_meta)

        meta = ClusterMetadata(project_id=known_meta.project_id,
                               region=known_meta.region,
                               cluster_name=known_meta.cluster_name)
        self.clusters.cleanup(meta)
        self.m_delete_cluster.assert_called_once()
        self.assertNotIn(known_meta, self.clusters.dataproc_cluster_managers)
        self.assertNotIn(known_meta.master_url, self.clusters.master_urls)
        self.assertIsNone(self.clusters.default_cluster_metadata)
Example 7
 def test_get_master_url_and_dashboard(self, mock_parse_method):
   """
    Tests that get_master_url_and_dashboard detects the line containing the
    unique substring which identifies the location of the master_url and
    application id of the Flink master.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   cluster_manager._fs = MockFileSystem()
   cluster_metadata._staging_directory = 'test-staging-bucket'
   master_url, dashboard = cluster_manager.get_master_url_and_dashboard()
   self.assertEqual(master_url, 'test-master-url')
   self.assertEqual(dashboard, 'test-dashboard-link')
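
A sketch of the detection step the docstring above describes, scanning staged log output for the identifying substring; the marker string is an assumption inferred from the mock line in Example 27:

    UNIQUE_SUBSTRING = 'Found Web Interface'  # assumed marker, see Example 27

    def find_master_line(lines):
        # Return the first line carrying the master_url/application id info.
        for line in lines:
            if UNIQUE_SUBSTRING in line:
                return line
        return None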
Example 8
    def test_cleanup_by_a_pipeline(self):
        meta = ClusterMetadata(project_id='test-project')
        dcm = self.clusters.create(meta)

        # Set up the association between a pipeline and a cluster.
        # In real code, it's set by the runner the first time a pipeline is executed.
        options = PipelineOptions()
        options.view_as(FlinkRunnerOptions).flink_master = meta.master_url
        p = beam.Pipeline(options=options)
        self.clusters.pipelines[p] = dcm
        dcm.pipelines.add(p)

        self.clusters.cleanup(p)
        # Delete the cluster.
        self.m_delete_cluster.assert_called_once()
        # Pipeline association is cleaned up.
        self.assertNotIn(p, self.clusters.pipelines)
        self.assertNotIn(p, dcm.pipelines)
        self.assertEqual(
            options.view_as(FlinkRunnerOptions).flink_master, '[auto]')
        # The cluster is unknown now.
        self.assertNotIn(meta, self.clusters.dataproc_cluster_managers)
        self.assertNotIn(meta.master_url, self.clusters.master_urls)
        # The cleaned-up cluster was also the default cluster, so the default
        # is cleared.
        self.assertIsNone(self.clusters.default_cluster_metadata)
Example 9
    def test_not_cleanup_if_multiple_pipelines_share_a_manager(self):
        meta = ClusterMetadata(project_id='test-project')
        dcm = self.clusters.create(meta)

        options = PipelineOptions()
        options.view_as(FlinkRunnerOptions).flink_master = meta.master_url
        options2 = PipelineOptions()
        options2.view_as(FlinkRunnerOptions).flink_master = meta.master_url
        p = beam.Pipeline(options=options)
        p2 = beam.Pipeline(options=options2)
        self.clusters.pipelines[p] = dcm
        self.clusters.pipelines[p2] = dcm
        dcm.pipelines.add(p)
        dcm.pipelines.add(p2)

        self.clusters.cleanup(p)
        # No cluster deleted.
        self.m_delete_cluster.assert_not_called()
        # Pipeline association of p is cleaned up.
        self.assertNotIn(p, self.clusters.pipelines)
        self.assertNotIn(p, dcm.pipelines)
        self.assertEqual(
            options.view_as(FlinkRunnerOptions).flink_master, '[auto]')
        # Pipeline association of p2 is still present.
        self.assertIn(p2, self.clusters.pipelines)
        self.assertIn(p2, dcm.pipelines)
        self.assertEqual(
            options2.view_as(FlinkRunnerOptions).flink_master, meta.master_url)
        # The cluster is still known.
        self.assertIn(meta, self.clusters.dataproc_cluster_managers)
        self.assertIn(meta.master_url, self.clusters.master_urls)
        # The default cluster is still present.
        self.assertIs(meta, self.clusters.default_cluster_metadata)
Example 10
    def configure_for_flink(self, user_pipeline: beam.Pipeline,
                            options: PipelineOptions) -> None:
        """Configures the pipeline options for running a job with Flink.

    When running with a FlinkRunner, a job server started from an uber jar
    (locally built or remotely downloaded) hosting the beam_job_api will
    communicate with the Flink cluster located at the given flink_master in the
    pipeline options.
    """
        clusters = ie.current_env().clusters
        if clusters.pipelines.get(user_pipeline, None):
            # Noop for a known pipeline using a known Dataproc cluster.
            return
        flink_master = options.view_as(FlinkRunnerOptions).flink_master
        cluster_metadata = clusters.default_cluster_metadata
        if flink_master == '[auto]':
            # Try to create/reuse a cluster when no flink_master is given.
            project_id = options.view_as(GoogleCloudOptions).project
            region = options.view_as(GoogleCloudOptions).region
            if project_id:
                if clusters.default_cluster_metadata:
                    # Reuse the cluster name from default in case of a known cluster.
                    cluster_metadata = ClusterMetadata(
                        project_id=project_id,
                        region=region,
                        cluster_name=clusters.default_cluster_metadata.
                        cluster_name)
                else:
                    # Generate the metadata with a new unique cluster name.
                    cluster_metadata = ClusterMetadata(project_id=project_id,
                                                       region=region)
                # Add additional configurations.
                self._worker_options_to_cluster_metadata(
                    options, cluster_metadata)
            # else use the default cluster metadata.
        elif flink_master in clusters.master_urls:
            cluster_metadata = clusters.cluster_metadata(flink_master)
        else:  # Noop if a self-hosted Flink is in use.
            return
        if not cluster_metadata:
            return  # Not even a default cluster to create/reuse, run Flink locally.
        dcm = clusters.create(cluster_metadata)
        # Side effects associated with the user_pipeline.
        clusters.pipelines[user_pipeline] = dcm
        dcm.pipelines.add(user_pipeline)
        self._configure_flink_options(options, clusters.DATAPROC_FLINK_VERSION,
                                      dcm.cluster_metadata.master_url)
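
A minimal invocation sketch, mirroring the tests in Examples 1 and 25; the project and region are hypothetical, and calling this against a real project would create or reuse a Dataproc cluster:

    import apache_beam as beam
    from apache_beam.options.pipeline_options import (
        FlinkRunnerOptions, PipelineOptions)
    from apache_beam.runners.interactive import interactive_runner
    from apache_beam.runners.portability.flink_runner import FlinkRunner

    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='my-project', region='us-central1')
    p = beam.Pipeline(runner=runner, options=options)
    runner.configure_for_flink(p, options)
    # flink_master now points at the cluster's master_url and flink_version
    # is pinned to clusters.DATAPROC_FLINK_VERSION.
    flink_options = options.view_as(FlinkRunnerOptions)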
Example 11
    def test_cluster_metadata_identifies_master_url(self):
        cid = 'test-url'
        known_meta = ClusterMetadata(project_id='test-project')
        _ = DataprocClusterManager(known_meta)
        self.clusters.master_urls[cid] = known_meta

        meta = self.clusters.cluster_metadata(cid)
        self.assertIs(meta, known_meta)
Example 12
    def test_cluster_metadata_identifies_pipeline(self):
        cid = beam.Pipeline()
        known_meta = ClusterMetadata(project_id='test-project')
        dcm = DataprocClusterManager(known_meta)
        self.clusters.pipelines[cid] = dcm

        meta = self.clusters.cluster_metadata(cid)
        self.assertIs(meta, known_meta)
Example 13
    def test_cleanup_by_a_master_url(self):
        meta = ClusterMetadata(project_id='test-project')
        _ = self.clusters.create(meta)

        self.clusters.cleanup(meta.master_url)
        self.m_delete_cluster.assert_called_once()
        self.assertNotIn(meta, self.clusters.dataproc_cluster_managers)
        self.assertNotIn(meta.master_url, self.clusters.master_urls)
        self.assertIsNone(self.clusters.default_cluster_metadata)
Example 14
 def test_get_staging_location_exception(self, mock_cluster_client):
   """
    Tests that an error raised inside get_staging_location is propagated.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project',
       region='test-region',
       cluster_name='test-cluster')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   with self.assertRaises(MockException):
     cluster_manager.get_staging_location()
Example 15
    def test_create_but_reuse_a_known_cluster(self):
        known_meta = ClusterMetadata(project_id='test-project',
                                     region='test-region')
        known_dcm = DataprocClusterManager(known_meta)
        known_meta.master_url = 'test-url'
        self.clusters.set_default_cluster(known_meta)
        self.clusters.dataproc_cluster_managers[known_meta] = known_dcm
        self.clusters.master_urls[known_meta.master_url] = known_meta

        # Use an equivalent meta as the identifier to create a cluster.
        cid_meta = ClusterMetadata(project_id=known_meta.project_id,
                                   region=known_meta.region,
                                   cluster_name=known_meta.cluster_name)
        dcm = self.clusters.create(cid_meta)
        # The known cluster manager is returned.
        self.assertIs(dcm, known_dcm)

        # Then use an equivalent master_url as the identifier.
        cid_master_url = known_meta.master_url
        dcm = self.clusters.create(cid_master_url)
        self.assertIs(dcm, known_dcm)
Example 16
    def test_create_a_new_cluster(self):
        meta = ClusterMetadata(project_id='test-project')
        _ = self.clusters.create(meta)

        # Derived fields are populated.
        self.assertTrue(meta.master_url.startswith('test-url'))
        self.assertEqual(meta.dashboard, 'test-dashboard')
        # The cluster is known.
        self.assertIn(meta, self.clusters.dataproc_cluster_managers)
        self.assertIn(meta.master_url, self.clusters.master_urls)
        # The default cluster is updated to the created cluster.
        self.assertIs(meta, self.clusters.default_cluster_metadata)
Example 17
 def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
   """
   Tests that an exception is thrown when a user specifies a region
   that does not exist.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.create_cluster, {})
     self.assertTrue('Invalid region provided' in context_manager.output[0])
Example 18
 def test_create_cluster_default_already_exists(self, mock_cluster_client):
   """
    Tests that no exception is thrown when a cluster using
    ie.current_env().clusters.default_cluster_name already exists.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='INFO') as context_manager:
     cluster_manager.create_cluster({})
     self.assertTrue('already exists' in context_manager.output[0])
Example 19
 def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
   """
   Tests that an exception is thrown when cleanup attempts to delete
   a cluster that does not exist.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.cleanup)
     self.assertTrue('Cluster does not exist' in context_manager.output[0])
Example 20
 def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
   """
    Tests that an exception not handled by any other case under cleanup is
    re-raised.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(MockException, cluster_manager.cleanup)
     self.assertTrue('Failed to delete cluster' in context_manager.output[0])
Example 21
    def test_cluster_metadata_default_value(self):
        cid_none = None
        cid_unknown_p = beam.Pipeline()
        cid_unknown_master_url = 'test-url'
        default_meta = ClusterMetadata(project_id='test-project')
        self.clusters.set_default_cluster(default_meta)

        self.assertIs(default_meta, self.clusters.cluster_metadata(cid_none))
        self.assertIs(default_meta,
                      self.clusters.cluster_metadata(cid_unknown_p))
        self.assertIs(default_meta,
                      self.clusters.cluster_metadata(cid_unknown_master_url))
Example 22
 def configure_for_flink(self, user_pipeline: beam.Pipeline,
                         options: PipelineOptions) -> None:
     """Tunes the pipeline options for the setup of running a job with Flink.
 """
     clusters = ie.current_env().clusters
     if clusters.pipelines.get(user_pipeline, None):
         # Noop for a known pipeline using a known Dataproc cluster.
         return
     flink_master = options.view_as(FlinkRunnerOptions).flink_master
     cluster_metadata = clusters.default_cluster_metadata
     if flink_master == '[auto]':
         # Try to create/reuse a cluster when no flink_master is given.
         project_id = options.view_as(GoogleCloudOptions).project
         region = options.view_as(GoogleCloudOptions).region
         if project_id:
             if clusters.default_cluster_metadata:
                 # Reuse the cluster name from default in case of a known cluster.
                 cluster_metadata = ClusterMetadata(
                     project_id=project_id,
                     region=region,
                     cluster_name=clusters.default_cluster_metadata.
                     cluster_name)
             else:
                 # Generate the metadata with a new unique cluster name.
                 cluster_metadata = ClusterMetadata(project_id=project_id,
                                                    region=region)
         # else use the default cluster metadata.
     elif flink_master in clusters.master_urls:
         cluster_metadata = clusters.cluster_metadata(flink_master)
     else:  # Noop if a self-hosted Flink is in use.
         return
     if not cluster_metadata:
         return  # Not even a default cluster to create/reuse, run Flink locally.
     dcm = clusters.create(cluster_metadata)
     # Side effects associated with the user_pipeline.
     clusters.pipelines[user_pipeline] = dcm
     dcm.pipelines.add(user_pipeline)
     flink_options = options.view_as(FlinkRunnerOptions)
     flink_options.flink_master = dcm.cluster_metadata.master_url
     flink_options.flink_version = clusters.DATAPROC_FLINK_VERSION
Example 23
 def test_get_staging_location(self, mock_cluster_client, mock_list):
   """
    Tests that get_staging_location successfully returns a mock staging
    location.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project',
       region='test-region',
       cluster_name='test-cluster')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   self.assertEqual(
       cluster_manager.get_staging_location(),
       'gs://test-bucket/google-cloud-dataproc-metainfo/')
Example 24
 def test_cleanup_permission_denied(self, mock_cluster_client, mock_cleanup):
   """
    Tests that an exception is thrown when a user tries to delete a cluster
    in a project for which they have insufficient permissions.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.cleanup)
     self.assertTrue(
         'Due to insufficient project permissions' in
         context_manager.output[0])
Example 25
    def test_reuse_a_cluster_for_a_known_pipeline(self):
        clusters = self.current_env.clusters
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions(project='test-project', region='test-region')
        p = beam.Pipeline(runner=runner, options=options)
        meta = ClusterMetadata(project_id='test-project', region='test-region')
        dcm = DataprocClusterManager(meta)
        # Configure the clusters so that the pipeline is known.
        clusters.pipelines[p] = dcm
        runner.configure_for_flink(p, options)

        # A known cluster is reused.
        tuned_meta = clusters.cluster_metadata(p)
        self.assertIs(tuned_meta, meta)
Example 26
    def test_describe_everything_when_cluster_identifer_unknown(self):
        known_meta = ClusterMetadata(project_id='test-project')
        known_meta2 = ClusterMetadata(project_id='test-project',
                                      region='some-other-region')
        dcm = self.clusters.create(known_meta)
        dcm2 = self.clusters.create(known_meta2)
        p = beam.Pipeline()
        p2 = beam.Pipeline()
        self.clusters.pipelines[p] = dcm
        dcm.pipelines.add(p)
        self.clusters.pipelines[p2] = dcm2
        dcm2.pipelines.add(p2)

        cid_pipeline = beam.Pipeline()
        meta_list = self.clusters.describe(cid_pipeline)
        self.assertEqual([known_meta, known_meta2], meta_list)

        cid_master_url = 'some-random-url'
        meta_list = self.clusters.describe(cid_master_url)
        self.assertEqual([known_meta, known_meta2], meta_list)

        cid_meta = ClusterMetadata(project_id='some-random-project')
        meta_list = self.clusters.describe(cid_meta)
        self.assertEqual([known_meta, known_meta2], meta_list)
Example 27
 def test_parse_master_url_and_dashboard(self, mock_cluster_details):
   """
   Tests that parse_master_url_and_dashboard properly parses the input
   string and produces a mock master_url and mock dashboard link.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
    line = (
        'test-line Found Web Interface test-master-url'
        ' of application \'test-app-id\'.\n')
   master_url, dashboard = cluster_manager.parse_master_url_and_dashboard(line)
   self.assertEqual('test-master-url', master_url)
   self.assertEqual(
       'test-resource-manager/gateway/default/yarn/proxy/test-app-id/',
       dashboard)
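
A standalone sketch (not the Beam implementation) of how the master_url and YARN application id can be pulled out of such a line, with the dashboard then derived from the resource manager address; `resource_manager` is a hypothetical stand-in:

    import re

    line = ('test-line Found Web Interface test-master-url'
            ' of application \'test-app-id\'.\n')
    match = re.search(
        r"Found Web Interface (\S+) of application '([^']+)'", line)
    master_url, app_id = match.group(1), match.group(2)
    resource_manager = 'test-resource-manager'
    dashboard = f'{resource_manager}/gateway/default/yarn/proxy/{app_id}/'
    assert master_url == 'test-master-url'
    assert dashboard == (
        'test-resource-manager/gateway/default/yarn/proxy/test-app-id/')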
Example 28
    def test_cleanup_noop_unknown_cluster(self):
        meta = ClusterMetadata(project_id='test-project')
        dcm = self.clusters.create(meta)
        p = beam.Pipeline()
        self.clusters.pipelines[p] = dcm
        dcm.pipelines.add(p)

        cid_pipeline = beam.Pipeline()
        self.clusters.cleanup(cid_pipeline)
        self.m_delete_cluster.assert_not_called()

        cid_master_url = 'some-random-url'
        self.clusters.cleanup(cid_master_url)
        self.m_delete_cluster.assert_not_called()

        cid_meta = ClusterMetadata(project_id='random-project')
        self.clusters.cleanup(cid_meta)
        self.m_delete_cluster.assert_not_called()

        self.assertIn(meta, self.clusters.dataproc_cluster_managers)
        self.assertIn(meta.master_url, self.clusters.master_urls)
        self.assertIs(meta, self.clusters.default_cluster_metadata)
        self.assertIn(p, self.clusters.pipelines)
        self.assertIn(p, dcm.pipelines)
Example 29
 def test_get_cluster_details_permission_denied(self, mock_cluster_client):
   """
    Tests that an exception is thrown when a user tries to get information
    for a project without sufficient permissions to do so.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(
       _LOGGER,
       level='ERROR') as context_manager, self.assertRaises(ValueError):
     cluster_manager.get_cluster_details()
     self.assertTrue(
         'Due to insufficient project permissions' in
         context_manager.output[0])
Example 30
 def test_list_clusters(self):
     meta = ClusterMetadata(project_id='project')
     dcm = self.current_env.clusters.create(meta)
     p = beam.Pipeline()
     dcm.pipelines.add(p)
     self.current_env.clusters.pipelines[p] = dcm
     cluster_id = obfuscate(meta)
     self.assertEqual(
         {
             cluster_id: {
                 'cluster_name': meta.cluster_name,
                 'project': meta.project_id,
                 'region': meta.region,
                 'master_url': meta.master_url,
                 'dashboard': meta.dashboard,
                 'pipelines': [str(id(p)) for p in dcm.pipelines]
             }
         }, json.loads(self.current_env.inspector.list_clusters()))