Example #1
0
    def test_reuse_default_cluster_if_not_configured(self):
        """A pipeline not configured for Cloud picks up the default cluster."""
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions()
        # No project/region supplied: the pipeline does not target Cloud.
        p = beam.Pipeline(runner=runner, options=options)
        known_meta = ClusterMetadata(
            project_id='test-project', region='test-region')
        known_meta.master_url = 'test-url'
        known_meta.dashboard = 'test-dashboard'
        manager = DataprocClusterManager(known_meta)
        clusters = self.current_env.clusters
        # Register the cluster and mark it as the default one.
        clusters.dataproc_cluster_managers[known_meta] = manager
        clusters.set_default_cluster(known_meta)
        runner.configure_for_flink(p, options)

        tuned_meta = clusters.cluster_metadata(p)
        # The default cluster metadata object is reused verbatim.
        self.assertIs(tuned_meta, clusters.default_cluster_metadata)
        # The pipeline is tracked globally and by its cluster manager.
        self.assertIn(p, clusters.pipelines)
        manager_for_p = clusters.pipelines[p]
        self.assertIn(p, manager_for_p.pipelines)
        # Flink options are tuned to point at the default cluster.
        flink_options = options.view_as(FlinkRunnerOptions)
        self.assertEqual(flink_options.flink_master, tuned_meta.master_url)
        self.assertEqual(
            flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
Example #2
0
    def test_create_a_new_cluster_for_a_new_pipeline(self):
        """An unseen Cloud-targeted pipeline gets a newly created cluster."""
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions(project='test-project', region='test-region')
        p = beam.Pipeline(runner=runner, options=options)
        runner.configure_for_flink(p, options)

        clusters = self.current_env.clusters
        created_meta = clusters.cluster_metadata(p)
        # Every metadata field should be filled in by the configuration step.
        self.assertEqual(created_meta.project_id, 'test-project')
        self.assertEqual(created_meta.region, 'test-region')
        self.assertTrue(
            created_meta.cluster_name.startswith('interactive-beam-'))
        self.assertTrue(created_meta.master_url.startswith('test-url'))
        self.assertEqual(created_meta.dashboard, 'test-dashboard')
        # The new cluster and the pipeline are now tracked.
        self.assertIn(created_meta, clusters.dataproc_cluster_managers)
        self.assertIn(created_meta.master_url, clusters.master_urls)
        self.assertIn(p, clusters.pipelines)
        # The freshly created cluster becomes the default.
        self.assertIs(created_meta, clusters.default_cluster_metadata)
        # Flink options now point at the created cluster.
        flink_options = options.view_as(FlinkRunnerOptions)
        self.assertEqual(flink_options.flink_master, created_meta.master_url)
        self.assertEqual(
            flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
Example #3
0
 def test_get_master_url_flink_master_provided(self):
   """No Dataproc cluster is created when the user supplies flink_master."""
   runner = interactive_runner.InteractiveRunner()
   from apache_beam.runners.portability.flink_runner import FlinkRunner
   p = beam.Pipeline(
       interactive_runner.InteractiveRunner(underlying_runner=FlinkRunner()),
       # Pass only the option value; the previous
       # '--flink_master=test.internal:1' string stored the CLI flag syntax
       # itself as the value.
       options=PipelineOptions(flink_master='test.internal:1'))
   runner._get_dataproc_cluster_master_url_if_applicable(p)
   # A user-provided master means no cluster metadata is recorded.
   self.assertEqual(ie.current_env().clusters.describe(), {})
   # Reset the global cluster state so other tests are unaffected.
   ie.current_env().clusters = ib.Clusters()
Example #4
0
    def test_configure_flink_options(self):
        """Flink options are tuned to target the provisioned cluster."""
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions(project='test-project', region='test-region')
        p = beam.Pipeline(runner=runner, options=options)
        runner.configure_for_flink(p, options)

        clusters = self.current_env.clusters
        tuned = options.view_as(FlinkRunnerOptions)
        # The Flink version is pinned to the one supported by Dataproc.
        self.assertEqual(tuned.flink_version, clusters.DATAPROC_FLINK_VERSION)
        # The master points at the cluster spun up for this pipeline.
        self.assertTrue(tuned.flink_master.startswith('test-url-'))
 def test_get_master_url_no_flink_master_or_provided_master_url(
         self, mock_create_cluster):
     """A cluster is provisioned when no flink master is given at all."""
     from apache_beam.runners.portability.flink_runner import FlinkRunner
     runner = interactive_runner.InteractiveRunner(
         underlying_runner=FlinkRunner())
     pipeline = beam.Pipeline(options=PipelineOptions(
         project='test-project',
         region='test-region',
     ))
     runner._get_dataproc_cluster_master_url_if_applicable(pipeline)
     described = ie.current_env().clusters.describe(pipeline)
     # The recorded cluster metadata carries the pipeline's project.
     self.assertEqual(
         described['cluster_metadata'].project_id, 'test-project')
     # Reset the global cluster state so other tests are unaffected.
     ie.current_env().clusters = ib.Clusters()
Example #6
0
    def test_configure_flink_options_with_flink_version_overridden(self):
        """A user-set flink_version is replaced by the Dataproc-supported one."""
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions(project='test-project', region='test-region')
        flink_options = options.view_as(FlinkRunnerOptions)
        flink_options.flink_version = 'test-version'
        p = beam.Pipeline(runner=runner, options=options)
        runner.configure_for_flink(p, options)

        clusters = self.current_env.clusters
        # Whatever the user set, the version is forced back to the one used
        # by the EMR solution, currently only 1: Cloud Dataproc.
        self.assertEqual(
            flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
Example #7
0
    def test_reuse_a_cluster_for_a_known_pipeline(self):
        """A pipeline already mapped to a cluster manager reuses its cluster."""
        clusters = self.current_env.clusters
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions(project='test-project', region='test-region')
        p = beam.Pipeline(runner=runner, options=options)
        known_meta = ClusterMetadata(
            project_id='test-project', region='test-region')
        # Pre-register the pipeline with an existing cluster manager.
        clusters.pipelines[p] = DataprocClusterManager(known_meta)
        runner.configure_for_flink(p, options)

        # Configuration resolves to the already-known metadata object.
        self.assertIs(clusters.cluster_metadata(p), known_meta)
Example #8
0
    def test_worker_options_to_cluster_metadata(self):
        """Worker pipeline options are copied into the cluster metadata."""
        runner = interactive_runner.InteractiveRunner(
            underlying_runner=FlinkRunner())
        options = PipelineOptions(project='test-project', region='test-region')
        worker_options = options.view_as(WorkerOptions)
        worker_options.num_workers = 2
        worker_options.subnetwork = 'test-network'
        worker_options.machine_type = 'test-machine-type'
        p = beam.Pipeline(runner=runner, options=options)
        runner.configure_for_flink(p, options)

        clusters = self.current_env.clusters
        configured_meta = clusters.cluster_metadata(p)
        # Each worker setting must be mirrored on the metadata.
        self.assertEqual(
            configured_meta.num_workers, worker_options.num_workers)
        self.assertEqual(
            configured_meta.subnetwork, worker_options.subnetwork)
        self.assertEqual(
            configured_meta.machine_type, worker_options.machine_type)
 def test_get_master_url_no_flink_master_and_master_url_exists(self):
     """An already-known master URL is reused instead of creating a cluster."""
     from apache_beam.runners.portability.flink_runner import FlinkRunner
     runner = interactive_runner.InteractiveRunner(
         underlying_runner=FlinkRunner())
     pipeline = beam.Pipeline(options=PipelineOptions(
         project='test-project',
         region='test-region',
     ))
     # Seed the environment with a known master URL and its dashboard.
     cluster_metadata = MasterURLIdentifier(
         project_id='test-project',
         region='test-region',
         cluster_name=ie.current_env().clusters.default_cluster_name)
     ie.current_env().clusters.master_urls['test-url'] = cluster_metadata
     ie.current_env(
     ).clusters.master_urls_to_dashboards['test-url'] = 'test-dashboard'
     flink_master = runner._get_dataproc_cluster_master_url_if_applicable(
         pipeline)
     description = ie.current_env().clusters.describe(pipeline)
     # The pipeline resolves to the seeded project and master URL.
     self.assertEqual(
         description['cluster_metadata'].project_id, 'test-project')
     self.assertEqual(flink_master, description['master_url'])
Example #10
0
 def test_detect_pipeline_underlying_runner(self):
     """detect_pipeline_runner unwraps InteractiveRunner to its delegate."""
     pipeline = beam.Pipeline(
         InteractiveRunner(underlying_runner=FlinkRunner()))
     detected = utils.detect_pipeline_runner(pipeline)
     self.assertTrue(isinstance(detected, FlinkRunner))