def test_reuse_default_cluster_if_not_configured(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions()
  # Pipeline is not configured to run on Cloud.
  p = beam.Pipeline(runner=runner, options=options)
  meta = ClusterMetadata(project_id='test-project', region='test-region')
  meta.master_url = 'test-url'
  meta.dashboard = 'test-dashboard'
  dcm = DataprocClusterManager(meta)
  # Configure the clusters so that a default cluster is known.
  clusters.dataproc_cluster_managers[meta] = dcm
  clusters.set_default_cluster(meta)
  runner.configure_for_flink(p, options)

  # The default cluster is used.
  tuned_meta = clusters.cluster_metadata(p)
  self.assertIs(tuned_meta, clusters.default_cluster_metadata)
  # The pipeline is known.
  self.assertIn(p, clusters.pipelines)
  registered_dcm = clusters.pipelines[p]
  self.assertIn(p, registered_dcm.pipelines)
  # The pipeline options are tuned for execution on the cluster.
  flink_options = options.view_as(FlinkRunnerOptions)
  self.assertEqual(flink_options.flink_master, tuned_meta.master_url)
  self.assertEqual(
      flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
def test_create_a_new_cluster_for_a_new_pipeline(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  p = beam.Pipeline(runner=runner, options=options)
  runner.configure_for_flink(p, options)

  # Fetch the metadata and assert all side effects.
  meta = clusters.cluster_metadata(p)
  # The metadata should have all fields populated.
  self.assertEqual(meta.project_id, 'test-project')
  self.assertEqual(meta.region, 'test-region')
  self.assertTrue(meta.cluster_name.startswith('interactive-beam-'))
  self.assertTrue(meta.master_url.startswith('test-url'))
  self.assertEqual(meta.dashboard, 'test-dashboard')
  # The cluster is known now.
  self.assertIn(meta, clusters.dataproc_cluster_managers)
  self.assertIn(meta.master_url, clusters.master_urls)
  self.assertIn(p, clusters.pipelines)
  # The default cluster is updated to the created cluster.
  self.assertIs(meta, clusters.default_cluster_metadata)
  # The pipeline options are tuned for execution on the cluster.
  flink_options = options.view_as(FlinkRunnerOptions)
  self.assertEqual(flink_options.flink_master, meta.master_url)
  self.assertEqual(
      flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
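# A minimal sketch, assuming the same fixtures as the tests above, of the
# flow that test_reuse_default_cluster_if_not_configured and
# test_create_a_new_cluster_for_a_new_pipeline both exercise: build an
# interactive pipeline over FlinkRunner, let configure_for_flink provision or
# reuse a cluster, then read back the tuned metadata. The helper name is
# illustrative, not part of the original suite.
def _configure_and_describe(self):
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  p = beam.Pipeline(runner=runner, options=options)
  runner.configure_for_flink(p, options)
  # After configuration the pipeline is registered with the Clusters
  # registry, and its metadata carries the master URL that
  # FlinkRunnerOptions.flink_master now points at.
  meta = self.current_env.clusters.cluster_metadata(p)
  return meta.master_url, meta.dashboard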
def test_get_master_url_flink_master_provided(self):
  runner = interactive_runner.InteractiveRunner()
  p = beam.Pipeline(
      interactive_runner.InteractiveRunner(underlying_runner=FlinkRunner()),
      options=PipelineOptions(flink_master='--flink_master=test.internal:1'))
  # A user-provided flink_master means no Dataproc cluster should be created,
  # so the global Clusters instance stays empty.
  runner._get_dataproc_cluster_master_url_if_applicable(p)
  self.assertEqual(ie.current_env().clusters.describe(), {})
  # Reset the global state touched by this test.
  ie.current_env().clusters = ib.Clusters()
def test_configure_flink_options(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  p = beam.Pipeline(runner=runner, options=options)
  runner.configure_for_flink(p, options)

  flink_options = options.view_as(FlinkRunnerOptions)
  self.assertEqual(
      flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
  self.assertTrue(flink_options.flink_master.startswith('test-url-'))
def test_get_master_url_no_flink_master_or_provided_master_url(
    self, mock_create_cluster):
  # mock_create_cluster is injected by a mock.patch decorator applied to this
  # test (not shown in this excerpt).
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  p = beam.Pipeline(
      options=PipelineOptions(
          project='test-project',
          region='test-region',
      ))
  # With neither a flink_master nor a known master URL, a cluster is created
  # and registered for the pipeline.
  runner._get_dataproc_cluster_master_url_if_applicable(p)
  self.assertEqual(
      ie.current_env().clusters.describe(p)['cluster_metadata'].project_id,
      'test-project')
  # Reset the global state touched by this test.
  ie.current_env().clusters = ib.Clusters()
def test_configure_flink_options_with_flink_version_overridden(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  flink_options = options.view_as(FlinkRunnerOptions)
  flink_options.flink_version = 'test-version'
  p = beam.Pipeline(runner=runner, options=options)
  runner.configure_for_flink(p, options)

  # The user-supplied version is overridden to the Flink version used by the
  # EMR solution; currently there is only one: Cloud Dataproc.
  self.assertEqual(
      flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
def test_reuse_a_cluster_for_a_known_pipeline(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  p = beam.Pipeline(runner=runner, options=options)
  meta = ClusterMetadata(project_id='test-project', region='test-region')
  dcm = DataprocClusterManager(meta)
  # Configure the clusters so that the pipeline is known.
  clusters.pipelines[p] = dcm
  runner.configure_for_flink(p, options)

  # A known cluster is reused.
  tuned_meta = clusters.cluster_metadata(p)
  self.assertIs(tuned_meta, meta)
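# A sketch, using only attributes exercised in the tests above, of how the
# Clusters registry keeps its two maps: dataproc_cluster_managers is keyed by
# ClusterMetadata (which clusters are known), while pipelines is keyed by the
# pipeline object (which cluster each pipeline is bound to). The helper name
# is illustrative, not part of the original suite.
def _register_known_cluster(self, p, meta):
  clusters = self.current_env.clusters
  dcm = DataprocClusterManager(meta)
  clusters.dataproc_cluster_managers[meta] = dcm  # the cluster is known
  clusters.pipelines[p] = dcm  # the pipeline is bound to that cluster
  return dcm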
def test_worker_options_to_cluster_metadata(self):
  clusters = self.current_env.clusters
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  options = PipelineOptions(project='test-project', region='test-region')
  worker_options = options.view_as(WorkerOptions)
  worker_options.num_workers = 2
  worker_options.subnetwork = 'test-network'
  worker_options.machine_type = 'test-machine-type'
  p = beam.Pipeline(runner=runner, options=options)
  runner.configure_for_flink(p, options)

  # Worker options are copied into the cluster metadata so the provisioned
  # cluster matches the requested capacity.
  configured_meta = clusters.cluster_metadata(p)
  self.assertEqual(configured_meta.num_workers, worker_options.num_workers)
  self.assertEqual(configured_meta.subnetwork, worker_options.subnetwork)
  self.assertEqual(configured_meta.machine_type, worker_options.machine_type)
def test_get_master_url_no_flink_master_and_master_url_exists(self):
  runner = interactive_runner.InteractiveRunner(
      underlying_runner=FlinkRunner())
  p = beam.Pipeline(
      options=PipelineOptions(
          project='test-project',
          region='test-region',
      ))
  # Pre-register a known master URL so that no new cluster is created and
  # the existing one is picked up.
  cluster_name = ie.current_env().clusters.default_cluster_name
  cluster_metadata = MasterURLIdentifier(
      project_id='test-project',
      region='test-region',
      cluster_name=cluster_name)
  ie.current_env().clusters.master_urls['test-url'] = cluster_metadata
  ie.current_env().clusters.master_urls_to_dashboards[
      'test-url'] = 'test-dashboard'
  flink_master = runner._get_dataproc_cluster_master_url_if_applicable(p)
  self.assertEqual(
      ie.current_env().clusters.describe(p)['cluster_metadata'].project_id,
      'test-project')
  self.assertEqual(
      flink_master, ie.current_env().clusters.describe(p)['master_url'])
def test_detect_pipeline_underlying_runner(self):
  p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))
  pipeline_runner = utils.detect_pipeline_runner(p)
  self.assertIsInstance(pipeline_runner, FlinkRunner)
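# A short usage sketch, assuming only the behavior asserted above:
# detect_pipeline_runner unwraps an InteractiveRunner and reports the runner
# that will actually execute the pipeline. The helper name is illustrative.
def _sketch_detect_runner(self):
  p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))
  # The wrapper is transparent: the detected runner is the FlinkRunner
  # instance passed as underlying_runner, not the InteractiveRunner itself.
  return isinstance(utils.detect_pipeline_runner(p), FlinkRunner)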