def test_validate_template_location(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions([
      '--template_location',
      'abc',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertFalse(errors)
def test_zone_and_worker_zone_mutually_exclusive(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--zone',
      'us-east1-b',
      '--worker_zone',
      'us-east1-c',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertTrue(errors)
def test_dataflow_job_file_and_template_location_mutually_exclusive(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions([
      '--template_location', 'abc', '--dataflow_job_file', 'def'
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertTrue(errors)
def test_missing_required_options(self):
  options = PipelineOptions([''])
  runner = MockRunners.DataflowRunner()
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(
      self.check_errors_for_arguments(
          errors, ['project', 'staging_location', 'temp_location']),
      [])
def test_experiment_region_and_worker_region_mutually_exclusive(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--experiments',
      'worker_region=us-west1',
      '--worker_region',
      'us-east1',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertTrue(errors)
def get_validator(matcher):
  options = [
      '--project=example:example',
      '--job_name=job',
      '--staging_location=gs://foo/bar',
      '--temp_location=gs://foo/bar',
  ]
  if matcher:
    options.append('%s=%s' % ('--on_success_matcher', matcher.decode()))
  pipeline_options = PipelineOptions(options)
  runner = MockRunners.TestDataflowRunner()
  return PipelineOptionsValidator(pipeline_options, runner)
def get_validator(job_name):
  options = [
      '--project=example:example',
      '--staging_location=gs://foo/bar',
      '--temp_location=gs://foo/bar'
  ]
  if job_name is not None:
    options.append('--job_name=' + job_name)
  pipeline_options = PipelineOptions(options)
  runner = MockRunners.DataflowRunner()
  validator = PipelineOptionsValidator(pipeline_options, runner)
  return validator
def test_region_optional_for_non_service_runner(self):
  runner = MockRunners.DataflowRunner()
  # Remove default region for this test.
  runner.get_default_gcp_region = lambda: None
  options = PipelineOptions([
      '--project=example:example',
      '--temp_location=gs://foo/bar',
      '--dataflow_endpoint=http://localhost:20281',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 0)
def test_missing_required_options(self):
  options = PipelineOptions([''])
  runner = MockRunners.DataflowRunner()
  # Remove default region for this test.
  runner.get_default_gcp_region = lambda: None
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(
      self.check_errors_for_arguments(
          errors,
          ['project', 'staging_location', 'temp_location', 'region']),
      [])
def test_num_workers_can_equal_max_num_workers(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--num_workers=42',
      '--max_num_workers=42',
      '--worker_region=us-east1',
      '--project=example:example',
      '--temp_location=gs://foo/bar',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 0)
def test_zone_alias_worker_zone(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--zone=us-east1-b',
      '--project=example:example',
      '--temp_location=gs://foo/bar',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 0)
  self.assertIsNone(options.view_as(WorkerOptions).zone)
  self.assertEqual(options.view_as(WorkerOptions).worker_zone, 'us-east1-b')
def test_worker_harness_sdk_container_image_mutually_exclusive(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--worker_harness_container_image=WORKER',
      '--sdk_container_image=SDK_ONLY',
      '--project=example:example',
      '--temp_location=gs://foo/bar',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 1)
  self.assertIn('sdk_container_image', errors[0])
  self.assertIn('worker_harness_container_image', errors[0])
def test_max_num_workers_is_positive(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--max_num_workers=-1',
      '--worker_region=us-east1',
      '--project=example:example',
      '--temp_location=gs://foo/bar',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 1)
  self.assertIn('max_num_workers', errors[0])
  self.assertIn('-1', errors[0])
def test_transform_name_mapping_without_update(self):
  options = [
      '--project=example:example',
      '--staging_location=gs://foo/bar',
      '--temp_location=gs://foo/bar',
      '--transform_name_mapping={\"fromPardo\":\"toPardo\"}'
  ]
  pipeline_options = PipelineOptions(options)
  runner = MockRunners.DataflowRunner()
  validator = PipelineOptionsValidator(pipeline_options, runner)
  errors = validator.validate()
  assert_that(
      errors,
      only_contains(
          contains_string(
              'Transform name mapping option is only useful when '
              '--update and --streaming is specified')))
def get_validator(temp_location, staging_location):
  options = ['--project=example:example', '--job_name=job']
  if temp_location is not None:
    options.append('--temp_location=' + temp_location)
  if staging_location is not None:
    options.append('--staging_location=' + staging_location)
  pipeline_options = PipelineOptions(options)
  runner = MockRunners.DataflowRunner()
  validator = PipelineOptionsValidator(pipeline_options, runner)
  return validator
def test_transform_name_mapping_invalid_format(self):
  options = [
      '--project=example:example',
      '--staging_location=gs://foo/bar',
      '--temp_location=gs://foo/bar',
      '--update',
      '--job_name=test',
      '--streaming',
      '--transform_name_mapping={\"fromPardo\":123}'
  ]
  pipeline_options = PipelineOptions(options)
  runner = MockRunners.DataflowRunner()
  validator = PipelineOptionsValidator(pipeline_options, runner)
  errors = validator.validate()
  assert_that(
      errors,
      only_contains(
          contains_string('Invalid transform name mapping format.')))
def test_worker_region_and_worker_zone_mutually_exclusive(self):
  runner = MockRunners.DataflowRunner()
  options = PipelineOptions([
      '--worker_region',
      'us-east1',
      '--worker_zone',
      'us-east1-b',
      '--project=example:example',
      '--temp_location=gs://foo/bar',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 1)
  self.assertIn('worker_region', errors[0])
  self.assertIn('worker_zone', errors[0])
def test_alias_worker_harness_sdk_container_image(self):
  runner = MockRunners.DataflowRunner()
  test_image = "WORKER_HARNESS"
  options = PipelineOptions([
      '--worker_harness_container_image=%s' % test_image,
      '--project=example:example',
      '--temp_location=gs://foo/bar',
  ])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 0)
  self.assertEqual(
      options.view_as(WorkerOptions).worker_harness_container_image,
      test_image)
  self.assertEqual(
      options.view_as(WorkerOptions).sdk_container_image, test_image)
def test_local_runner(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions([])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertEqual(len(errors), 0)
def test_validate_dataflow_job_file(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions(['--dataflow_job_file', 'abc'])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertFalse(errors)
def __init__(self, runner=None, options=None, argv=None):
  """Initialize a pipeline object.

  Args:
    runner (~apache_beam.runners.runner.PipelineRunner): An object of
      type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
      used to execute the pipeline. For registered runners, the runner name
      can be specified, otherwise a runner object must be supplied.
    options (~apache_beam.options.pipeline_options.PipelineOptions): A
      configured
      :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
      containing arguments that should be used for running the Beam job.
    argv (List[str]): a list of arguments (such as :data:`sys.argv`) to be
      used for building a
      :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
      This will only be used if argument **options** is :data:`None`.

  Raises:
    ~exceptions.ValueError: if either the runner or options argument is not
      of the expected type.
  """
  if options is not None:
    if isinstance(options, PipelineOptions):
      self._options = options
    else:
      raise ValueError(
          'Parameter options, if specified, must be of type PipelineOptions. '
          'Received : %r' % options)
  elif argv is not None:
    if isinstance(argv, list):
      self._options = PipelineOptions(argv)
    else:
      raise ValueError(
          'Parameter argv, if specified, must be a list. Received : %r' % argv)
  else:
    self._options = PipelineOptions([])

  FileSystems.set_options(self._options)

  if runner is None:
    runner = self._options.view_as(StandardOptions).runner
    if runner is None:
      runner = StandardOptions.DEFAULT_RUNNER
      logging.info(('Missing pipeline option (runner). Executing pipeline '
                    'using the default runner: %s.'), runner)

  if isinstance(runner, str):
    runner = create_runner(runner)
  elif not isinstance(runner, PipelineRunner):
    raise TypeError('Runner must be a PipelineRunner object or the '
                    'name of a registered runner.')

  # Validate pipeline options
  errors = PipelineOptionsValidator(self._options, runner).validate()
  if errors:
    raise ValueError(
        'Pipeline has validations errors: \n' + '\n'.join(errors))

  # Default runner to be used.
  self.runner = runner
  # Stack of transforms generated by nested apply() calls. The stack will
  # contain a root node as an enclosing (parent) node for top transforms.
  self.transforms_stack = [AppliedPTransform(None, None, '', None)]
  # Set of transform labels (full labels) applied to the pipeline.
  # If a transform is applied and the full label is already in the set
  # then the transform will have to be cloned with a new label.
  self.applied_labels = set()
def __init__(self, runner=None, options=None, argv=None):
  # type: (Optional[Union[str, PipelineRunner]], Optional[PipelineOptions], Optional[List[str]]) -> None

  """Initialize a pipeline object.

  Args:
    runner (~apache_beam.runners.runner.PipelineRunner): An object of
      type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
      used to execute the pipeline. For registered runners, the runner name
      can be specified, otherwise a runner object must be supplied.
    options (~apache_beam.options.pipeline_options.PipelineOptions): A
      configured
      :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
      containing arguments that should be used for running the Beam job.
    argv (List[str]): a list of arguments (such as :data:`sys.argv`) to be
      used for building a
      :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
      This will only be used if argument **options** is :data:`None`.

  Raises:
    ValueError: if either the runner or options argument is not of the
      expected type.
  """
  # Initializing logging configuration in case the user did not set it up.
  logging.basicConfig()

  if options is not None:
    if isinstance(options, PipelineOptions):
      self._options = options
    else:
      raise ValueError(
          'Parameter options, if specified, must be of type PipelineOptions. '
          'Received : %r' % options)
  elif argv is not None:
    if isinstance(argv, list):
      self._options = PipelineOptions(argv)
    else:
      raise ValueError(
          'Parameter argv, if specified, must be a list. Received : %r' % argv)
  else:
    self._options = PipelineOptions([])

  FileSystems.set_options(self._options)

  if runner is None:
    runner = self._options.view_as(StandardOptions).runner
    if runner is None:
      runner = StandardOptions.DEFAULT_RUNNER
      logging.info(('Missing pipeline option (runner). Executing pipeline '
                    'using the default runner: %s.'), runner)

  if isinstance(runner, str):
    runner = create_runner(runner)
  elif not isinstance(runner, PipelineRunner):
    raise TypeError('Runner %s is not a PipelineRunner object or the '
                    'name of a registered runner.' % runner)

  # Validate pipeline options
  errors = PipelineOptionsValidator(self._options, runner).validate()
  if errors:
    raise ValueError(
        'Pipeline has validations errors: \n' + '\n'.join(errors))

  # set default experiments for portable runners
  # (needs to occur prior to pipeline construction)
  if runner.is_fnapi_compatible():
    experiments = (self._options.view_as(DebugOptions).experiments or [])
    if 'beam_fn_api' not in experiments:
      experiments.append('beam_fn_api')
    self._options.view_as(DebugOptions).experiments = experiments

  # Default runner to be used.
  self.runner = runner
  # Stack of transforms generated by nested apply() calls. The stack will
  # contain a root node as an enclosing (parent) node for top transforms.
  self.transforms_stack = [AppliedPTransform(None, None, '', None)]
  # Set of transform labels (full labels) applied to the pipeline.
  # If a transform is applied and the full label is already in the set
  # then the transform will have to be cloned with a new label.
  self.applied_labels = set()  # type: Set[str]
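For context, a minimal usage sketch of the constructor shown above: the runner may be given as a registered runner name or a PipelineRunner instance, options as a PipelineOptions object, and any errors reported by PipelineOptionsValidator surface as a ValueError at construction time. The 'DirectRunner' name and the flag value below are illustrative assumptions, not taken from the source above.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Build options from argv-style flags; the value here is a placeholder.
opts = PipelineOptions(['--job_name=example-job'])

# 'DirectRunner' is a registered runner name, so a string is accepted;
# passing a PipelineRunner instance would work as well. Invalid options
# raise ValueError via PipelineOptionsValidator inside __init__.
pipeline = beam.Pipeline(runner='DirectRunner', options=opts)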
def test_type_check_additional_unrecognized_feature(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions(['--type_check_additional=all,dfgdf'])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertTrue(errors)