  def test_cleanup_does_not_exist(self, mock_cluster_client, mock_cleanup):
    """
    Tests that an exception is thrown when cleanup attempts to delete
    a cluster that does not exist.
    """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.cleanup)
     self.assertTrue('Cluster does not exist' in context_manager.output[0])
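  # Note: the mock_cluster_client and mock_cleanup parameters in these tests
  # are presumably injected by unittest.mock.patch decorators (patching the
  # Dataproc client and DataprocClusterManager.cleanup) that are not shown in
  # this excerpt, and MockException is presumably a locally defined Exception
  # subclass used as the mocked client's side_effect. The snippets also come
  # from different Beam revisions, which is why some construct
  # MasterURLIdentifier while others construct the later ClusterMetadata.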
 def test_cleanup_other_exception(self, mock_cluster_client, mock_cleanup):
   """
   Tests that an exception is thrown when the exception is not handled by
   any other case under cleanup.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(MockException, cluster_manager.cleanup)
     self.assertTrue('Failed to delete cluster' in context_manager.output[0])
 def test_create_cluster_region_does_not_exist(self, mock_cluster_client):
   """
   Tests that an exception is thrown when a user specifies a region
   that does not exist.
   """
   cluster_metadata = MasterURLIdentifier(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.create_cluster, {})
     self.assertTrue('Invalid region provided' in context_manager.output[0])
 def test_create_cluster_other_exception(self, mock_cluster_client):
   """
   Tests that an exception is thrown when the exception is not handled by
   any other case under _create_cluster.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(MockException, cluster_manager.create_cluster, {})
     self.assertTrue('Unable to create cluster' in context_manager.output[0])
  def test_create_cluster_permission_denied(self, mock_cluster_client):
    """
    Tests that an exception is thrown when a user is trying to write to
    a project while having insufficient permissions.
    """
    cluster_metadata = MasterURLIdentifier(
        project_id='test-project', region='test-region')
    cluster_manager = DataprocClusterManager(cluster_metadata)
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
    with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
      self.assertRaises(ValueError, cluster_manager.create_cluster, {})
      self.assertTrue(
          'Due to insufficient project permissions' in
          context_manager.output[0])
  def test_clusters_cleanup_otherwise(self, mock_cleanup):
    clusters = ie.current_env().clusters
    project = 'test-project'
    region = 'test-region'
    p = beam.Pipeline(
        options=PipelineOptions(
            project=project,
            region=region,
        ))
    cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
    clusters.dataproc_cluster_managers[str(
        id(p))] = DataprocClusterManager(cluster_metadata)
    clusters.dataproc_cluster_managers[str(id(p))].master_url = 'test_url'
    clusters.cleanup(p)
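  # Note: this snippet keys dataproc_cluster_managers by str(id(p)); other
  # snippets below key by the pipeline object or by the metadata itself. The
  # keying scheme varies with the Beam revision each snippet was taken from.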
 def test_clusters_describe(self):
   clusters = ib.Clusters()
   project = 'test-project'
   region = 'test-region'
   p = beam.Pipeline(
       options=PipelineOptions(
           project=project,
           region=region,
       ))
   cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
   clusters.dataproc_cluster_managers[p] = DataprocClusterManager(
       cluster_metadata)
    # Look up the description for the pipeline the manager was keyed by.
    self.assertEqual(
        'test-project',
        clusters.describe(p)['cluster_metadata'].project_id)
 def test_cleanup_permission_denied(self, mock_cluster_client, mock_cleanup):
   """
   Tests that an exception is thrown when a user is trying to delete
   a project that they have insufficient permissions for.
   """
   cluster_metadata = ClusterMetadata(
       project_id='test-project', region='test-region')
   cluster_manager = DataprocClusterManager(cluster_metadata)
   from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import _LOGGER
   with self.assertLogs(_LOGGER, level='ERROR') as context_manager:
     self.assertRaises(ValueError, cluster_manager.cleanup)
     self.assertTrue(
         'Due to insufficient project permissions' in
         context_manager.output[0])
 def test_cleanup_all_dataproc_clusters(self, mock_cleanup):
   env = ie.InteractiveEnvironment()
   project = 'test-project'
   region = 'test-region'
   p = beam.Pipeline(
       options=PipelineOptions(
           project=project,
           region=region,
       ))
   cluster_metadata = MasterURLIdentifier(project_id=project, region=region)
   env.clusters.dataproc_cluster_managers[str(
       id(p))] = DataprocClusterManager(cluster_metadata)
   env._tracked_user_pipelines.add_user_pipeline(p)
   env.cleanup()
   self.assertEqual(env.clusters.dataproc_cluster_managers, {})
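  # env.cleanup() is expected to call cleanup on every tracked cluster
  # manager (stubbed out here by mock_cleanup) and then clear the registry,
  # which the final assertEqual verifies.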
  def test_reuse_a_cluster_for_a_known_pipeline(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    p = beam.Pipeline(runner=runner, options=options)
    meta = ClusterMetadata(project_id='test-project', region='test-region')
    dcm = DataprocClusterManager(meta)
    # Configure the clusters so that the pipeline is known.
    clusters.pipelines[p] = dcm
    runner.configure_for_flink(p, options)

    # A known cluster is reused.
    tuned_meta = clusters.cluster_metadata(p)
    self.assertIs(tuned_meta, meta)
  def test_create_but_reuse_a_known_cluster(self):
    known_meta = ClusterMetadata(
        project_id='test-project', region='test-region')
    known_dcm = DataprocClusterManager(known_meta)
    known_meta.master_url = 'test-url'
    self.clusters.set_default_cluster(known_meta)
    self.clusters.dataproc_cluster_managers[known_meta] = known_dcm
    self.clusters.master_urls[known_meta.master_url] = known_meta

    # Use an equivalent meta as the identifier to create a cluster.
    cid_meta = ClusterMetadata(
        project_id=known_meta.project_id,
        region=known_meta.region,
        cluster_name=known_meta.cluster_name)
    dcm = self.clusters.create(cid_meta)
    # The known cluster manager is returned.
    self.assertIs(dcm, known_dcm)

    # Then use an equivalent master_url as the identifier.
    cid_master_url = known_meta.master_url
    dcm = self.clusters.create(cid_master_url)
    self.assertIs(dcm, known_dcm)
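  # The reuse above depends on ClusterMetadata instances with the same
  # identifying fields comparing equal and hashing alike, so dict lookups
  # resolve equivalent metadata to the known manager. A minimal sketch of
  # such identity semantics, under that assumption (not Beam's actual
  # definition):
  def test_metadata_identity_sketch(self):
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class SketchClusterMetadata:
      project_id: Optional[str] = None
      region: Optional[str] = None
      cluster_name: Optional[str] = None
      master_url: Optional[str] = None

      def _identity(self):
        # master_url is assigned after creation; it must not affect identity.
        return (self.project_id, self.region, self.cluster_name)

      def __eq__(self, other):
        return isinstance(other, SketchClusterMetadata) and (
            self._identity() == other._identity())

      def __hash__(self):
        return hash(self._identity())

    managers = {}
    known = SketchClusterMetadata('test-project', 'test-region', 'test-name')
    managers[known] = 'known_dcm'
    known.master_url = 'test-url'  # mutation does not change dict identity
    lookup = SketchClusterMetadata('test-project', 'test-region', 'test-name')
    self.assertEqual(managers[lookup], 'known_dcm')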
  def _create_dataproc_cluster_if_applicable(self, user_pipeline):
    """Creates a Dataproc cluster if the provided user_pipeline is running
    FlinkRunner and no flink_master_url was provided as an option. A cluster
    is not created when a flink_master_url is detected.

    Example pipeline options to enable automatic Dataproc cluster creation:
      options = PipelineOptions([
          '--runner=FlinkRunner',
          '--project=my-project',
          '--region=my-region',
          '--environment_type=DOCKER'
      ])

    Example pipeline options to skip automatic Dataproc cluster creation:
      options = PipelineOptions([
          '--runner=FlinkRunner',
          '--flink_master=example.internal:41979',
          '--environment_type=DOCKER'
      ])
    """
    from apache_beam.runners.portability.flink_runner import FlinkRunner
    from apache_beam.options.pipeline_options import FlinkRunnerOptions
    flink_master = user_pipeline.options.view_as(
        FlinkRunnerOptions).flink_master
    clusters = ie.current_env().clusters
    # Only apply this logic when both conditions hold: the underlying runner
    # is FlinkRunner and the pipeline has no cluster manager registered yet.
    if isinstance(self._underlying_runner, FlinkRunner) and (
        clusters.dataproc_cluster_managers.get(
            str(id(user_pipeline)), None) is None):
      if flink_master == '[auto]':
        # The '[auto]' value means the user has not provided a flink_master.
        if ie.current_env()._is_in_ipython:
          warnings.filterwarnings(
              'ignore',
              'options is deprecated since First stable release. References to '
              '<pipeline>.options will not be supported',
              category=DeprecationWarning)
        project_id = user_pipeline.options.view_as(GoogleCloudOptions).project
        region = user_pipeline.options.view_as(GoogleCloudOptions).region
        cluster_name = ie.current_env().clusters.default_cluster_name
        cluster_metadata = MasterURLIdentifier(
            project_id=project_id, region=region, cluster_name=cluster_name)
      else:
        cluster_metadata = clusters.master_urls.inverse.get(flink_master, None)
      # else noop: a master_url not managed by us may legitimately be
      # provided by the user, so there is nothing to log.
      if cluster_metadata:
        # Create the cluster_manager and populate the dicts in the clusters
        # instance, since the pipeline is not yet mapped to an existing
        # cluster_manager.
        cluster_manager = DataprocClusterManager(cluster_metadata)
        cluster_manager.create_flink_cluster()
        clusters.master_urls[cluster_manager.master_url] = cluster_metadata
        clusters.dataproc_cluster_managers[str(
            id(user_pipeline))] = cluster_manager
        clusters.master_urls_to_pipelines[
            cluster_manager.master_url].append(str(id(user_pipeline)))