Example #1
def test_streaming(self):
    pipeline_options = PipelineOptions(['--streaming'])
    runner = MockRunners.TestDataflowRunner()
    validator = PipelineOptionsValidator(pipeline_options, runner)
    errors = validator.validate()

    self.assertIn('Streaming pipelines are not supported.', errors)
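The snippets on this page all pass a runner taken from a MockRunners helper that the source test module defines but this page never shows. A minimal sketch, assuming only the shape implied by the examples below; the get_default_gcp_region method is corroborated by the `lambda: None` overrides in Examples #9 and #10, while the default region value itself is a guess:

 class MockRunners(object):
   class DataflowRunner(object):
     # Supply a default region so most tests need not pass --region;
     # tests that exercise the region check override this with `lambda: None`.
     def get_default_gcp_region(self):
       return 'us-central1'  # assumed default, not confirmed by this page

   class TestDataflowRunner(DataflowRunner):
     pass

   class OtherRunner(object):
     pass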
Example #2
 def test_dataflow_job_file_and_template_location_mutually_exclusive(self):
     runner = MockRunners.OtherRunner()
     options = PipelineOptions(
         ['--template_location', 'abc', '--dataflow_job_file', 'def'])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertTrue(errors)
Example #3
 def test_validate_template_location(self):
   runner = MockRunners.OtherRunner()
   options = PipelineOptions([
       '--template_location', 'abc',
   ])
   validator = PipelineOptionsValidator(options, runner)
   errors = validator.validate()
   self.assertFalse(errors)
Example #4
 def test_validate_dataflow_job_file(self):
   runner = MockRunners.OtherRunner()
   options = PipelineOptions([
       '--dataflow_job_file', 'abc'
   ])
   validator = PipelineOptionsValidator(options, runner)
   errors = validator.validate()
   self.assertFalse(errors)
Example #5
 def test_worker_region_and_worker_zone_mutually_exclusive(self):
   runner = MockRunners.DataflowRunner()
   options = PipelineOptions([
       '--worker_region', 'us-east1',
       '--worker_zone', 'us-east1-b',
   ])
   validator = PipelineOptionsValidator(options, runner)
   errors = validator.validate()
   self.assertTrue(errors)
Example #6
  def test_missing_required_options(self):
    options = PipelineOptions([''])
    runner = MockRunners.DataflowRunner()
    validator = PipelineOptionsValidator(options, runner)
    errors = validator.validate()

    self.assertEqual(
        self.check_errors_for_arguments(
            errors,
            ['project', 'staging_location', 'temp_location']),
        [])
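The test above relies on a check_errors_for_arguments helper that this page never shows. A minimal sketch, assuming only what the call sites imply (an empty return value means every expected argument was flagged and nothing else was):

 def check_errors_for_arguments(self, errors, args):
   # Hypothetical reconstruction; the real helper in Beam's test suite
   # may differ. Collect any mismatch between expected and actual errors.
   problems = []
   for arg in args:
     if not any(arg in error for error in errors):
       problems.append('Missing error for: %s' % arg)
   for error in errors:
     if not any(arg in error for arg in args):
       problems.append('Unexpected error: %s' % error)
   return problems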
Example #7
 def test_num_workers_can_equal_max_num_workers(self):
     runner = MockRunners.DataflowRunner()
     options = PipelineOptions([
         '--num_workers=42',
         '--max_num_workers=42',
         '--worker_region=us-east1',
         '--project=example:example',
         '--temp_location=gs://foo/bar',
     ])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertEqual(len(errors), 0)
Example #8
 def test_zone_alias_worker_zone(self):
   runner = MockRunners.DataflowRunner()
   options = PipelineOptions([
       '--zone=us-east1-b',
       '--project=example:example',
       '--temp_location=gs://foo/bar',
   ])
   validator = PipelineOptionsValidator(options, runner)
   errors = validator.validate()
   self.assertEqual(len(errors), 0)
   self.assertIsNone(options.view_as(WorkerOptions).zone)
   self.assertEqual(options.view_as(WorkerOptions).worker_zone, 'us-east1-b')
Example #9
  def test_missing_required_options(self):
    options = PipelineOptions([''])
    runner = MockRunners.DataflowRunner()
    # Remove default region for this test.
    runner.get_default_gcp_region = lambda: None
    validator = PipelineOptionsValidator(options, runner)
    errors = validator.validate()

    self.assertEqual(
        self.check_errors_for_arguments(
            errors, ['project', 'staging_location', 'temp_location', 'region']),
        [])
Example #10
 def test_region_optional_for_non_service_runner(self):
   runner = MockRunners.DataflowRunner()
   # Remove default region for this test.
   runner.get_default_gcp_region = lambda: None
   options = PipelineOptions([
       '--project=example:example',
       '--temp_location=gs://foo/bar',
       '--dataflow_endpoint=http://localhost:20281',
   ])
   validator = PipelineOptionsValidator(options, runner)
   errors = validator.validate()
   self.assertEqual(len(errors), 0)
Example #11
  def test_transform_name_mapping_without_update(self):
    options = ['--project=example:example',
               '--staging_location=gs://foo/bar',
               '--temp_location=gs://foo/bar',
               '--transform_name_mapping={\"fromPardo\":\"toPardo\"}']

    pipeline_options = PipelineOptions(options)
    runner = MockRunners.DataflowRunner()
    validator = PipelineOptionsValidator(pipeline_options, runner)
    errors = validator.validate()
    assert_that(errors, only_contains(
        contains_string('Transform name mapping option is only useful when '
                        '--update and --streaming is specified')))
Example #12
 def test_max_num_workers_is_positive(self):
     runner = MockRunners.DataflowRunner()
     options = PipelineOptions([
         '--max_num_workers=-1',
         '--worker_region=us-east1',
         '--project=example:example',
         '--temp_location=gs://foo/bar',
     ])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertEqual(len(errors), 1)
     self.assertIn('max_num_workers', errors[0])
     self.assertIn('-1', errors[0])
Example #13
 def test_worker_harness_sdk_container_image_mutually_exclusive(self):
     runner = MockRunners.DataflowRunner()
     options = PipelineOptions([
         '--worker_harness_container_image=WORKER',
         '--sdk_container_image=SDK_ONLY',
         '--project=example:example',
         '--temp_location=gs://foo/bar',
     ])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertEqual(len(errors), 1)
     self.assertIn('sdk_container_image', errors[0])
     self.assertIn('worker_harness_container_image', errors[0])
Example #14
    def test_transform_name_mapping_invalid_format(self):
        options = [
            '--project=example:example', '--staging_location=gs://foo/bar',
            '--temp_location=gs://foo/bar', '--update', '--job_name=test',
            '--streaming', '--transform_name_mapping={\"fromPardo\":123}'
        ]

        pipeline_options = PipelineOptions(options)
        runner = MockRunners.DataflowRunner()
        validator = PipelineOptionsValidator(pipeline_options, runner)
        errors = validator.validate()
        assert_that(
            errors,
            only_contains(
                contains_string('Invalid transform name mapping format.')))
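For contrast with the two failing cases in Examples #11 and #14, a mapping whose keys and values are all strings, submitted together with --update, --streaming, and --job_name, should validate cleanly. This is a hedged sketch, not a test from the Beam suite:

 def test_transform_name_mapping_valid(self):
   options = [
       '--project=example:example', '--staging_location=gs://foo/bar',
       '--temp_location=gs://foo/bar', '--update', '--job_name=test',
       '--streaming', '--transform_name_mapping={\"fromPardo\":\"toPardo\"}'
   ]
   pipeline_options = PipelineOptions(options)
   runner = MockRunners.DataflowRunner()
   validator = PipelineOptionsValidator(pipeline_options, runner)
   errors = validator.validate()
   self.assertEqual(errors, [])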
Example #15
 def test_worker_region_and_worker_zone_mutually_exclusive(self):
     runner = MockRunners.DataflowRunner()
     options = PipelineOptions([
         '--worker_region',
         'us-east1',
         '--worker_zone',
         'us-east1-b',
         '--project=example:example',
         '--temp_location=gs://foo/bar',
     ])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertEqual(len(errors), 1)
     self.assertIn('worker_region', errors[0])
     self.assertIn('worker_zone', errors[0])
Example #16
  def test_is_service_runner(self):
    test_cases = [
        {
            'runner': MockRunners.OtherRunner(),
            'options': [],
            'expected': False,
        },
        {
            'runner': MockRunners.OtherRunner(),
            'options': ['--dataflow_endpoint=https://dataflow.googleapis.com'],
            'expected': False,
        },
        {
            'runner': MockRunners.OtherRunner(),
            'options': ['--dataflow_endpoint=https://dataflow.googleapis.com/'],
            'expected': False,
        },
        {
            'runner': MockRunners.DataflowRunner(),
            'options': ['--dataflow_endpoint=https://another.service.com'],
            'expected': False,
        },
        {
            'runner': MockRunners.DataflowRunner(),
            'options': ['--dataflow_endpoint=https://another.service.com/'],
            'expected': False,
        },
        {
            'runner': MockRunners.DataflowRunner(),
            'options': ['--dataflow_endpoint=https://dataflow.googleapis.com'],
            'expected': True,
        },
        {
            'runner': MockRunners.DataflowRunner(),
            'options': ['--dataflow_endpoint=https://dataflow.googleapis.com/'],
            'expected': True,
        },
        {
            'runner': MockRunners.DataflowRunner(),
            'options': [],
            'expected': True,
        },
    ]

    for case in test_cases:
      validator = PipelineOptionsValidator(
          PipelineOptions(case['options']), case['runner'])
      self.assertEqual(validator.is_service_runner(), case['expected'])
Example #17
 def test_alias_worker_harness_sdk_container_image(self):
     runner = MockRunners.DataflowRunner()
     test_image = "WORKER_HARNESS"
     options = PipelineOptions([
         '--worker_harness_container_image=%s' % test_image,
         '--project=example:example',
         '--temp_location=gs://foo/bar',
     ])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertEqual(len(errors), 0)
     self.assertEqual(
         options.view_as(WorkerOptions).worker_harness_container_image,
         test_image)
     self.assertEqual(
         options.view_as(WorkerOptions).sdk_container_image, test_image)
Example #18
    def get_validator(matcher):
      options = ['--project=example:example',
                 '--job_name=job',
                 '--staging_location=gs://foo/bar',
                 '--temp_location=gs://foo/bar',]
      if matcher:
        options.append('%s=%s' % ('--on_success_matcher', matcher.decode()))

      pipeline_options = PipelineOptions(options)
      runner = MockRunners.TestDataflowRunner()
      return PipelineOptionsValidator(pipeline_options, runner)
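The matcher passed to --on_success_matcher is expected to be a serialized hamcrest Matcher, which the validator deserializes and type-checks. A hedged usage sketch for the helper above; AlwaysPassMatcher is a hypothetical stand-in, and Beam's internal pickler is assumed for serialization:

 from hamcrest.core.base_matcher import BaseMatcher

 from apache_beam.internal import pickler

 class AlwaysPassMatcher(BaseMatcher):
   def _matches(self, item):
     return True

   def describe_to(self, description):
     description.append_text('always passes')

 errors = get_validator(pickler.dumps(AlwaysPassMatcher())).validate()
 assert errors == []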
Example #19
    def get_validator(job_name):
      options = ['--project=example:example', '--staging_location=gs://foo/bar',
                 '--temp_location=gs://foo/bar']

      if job_name is not None:
        options.append('--job_name=' + job_name)

      pipeline_options = PipelineOptions(options)
      runner = MockRunners.DataflowRunner()
      validator = PipelineOptionsValidator(pipeline_options, runner)
      return validator
Example #20
        def get_validator(temp_location, staging_location):
            options = ['--project=example:example', '--job_name=job']

            if temp_location is not None:
                options.append('--temp_location=' + temp_location)

            if staging_location is not None:
                options.append('--staging_location=' + staging_location)

            pipeline_options = PipelineOptions(options)
            runner = MockRunners.DataflowRunner()
            validator = PipelineOptionsValidator(pipeline_options, runner)
            return validator
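A hedged sketch of how the helper above is typically driven. temp_location is mandatory for the Dataflow runner, so omitting both locations should surface at least one error mentioning it (exact message text not assumed):

 errors = get_validator(None, None).validate()
 assert any('temp_location' in error for error in errors)

 errors = get_validator('gs://foo/bar', 'gs://foo/bar').validate()
 assert errors == []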
Example #21
    def test_type_check_additional(self):
        runner = MockRunners.OtherRunner()
        options = PipelineOptions(['--type_check_additional=all'])
        validator = PipelineOptionsValidator(options, runner)
        errors = validator.validate()
        self.assertFalse(errors)

        options = PipelineOptions(['--type_check_additional='])
        validator = PipelineOptionsValidator(options, runner)
        errors = validator.validate()
        self.assertFalse(errors)
Example #22
 def test_local_runner(self):
     runner = MockRunners.OtherRunner()
     options = PipelineOptions([])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertEqual(len(errors), 0)
Example #23
    def __init__(self, runner=None, options=None, argv=None):
        # type: (Optional[Union[str, PipelineRunner]], Optional[PipelineOptions], Optional[List[str]]) -> None
        """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified, otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ValueError: if either the runner or options argument is not
        of the expected type.
    """
        # Initializing logging configuration in case the user did not set it up.
        logging.basicConfig()

        if options is not None:
            if isinstance(options, PipelineOptions):
                self._options = options
            else:
                raise ValueError(
                    'Parameter options, if specified, must be of type PipelineOptions. '
                    'Received : %r' % options)
        elif argv is not None:
            if isinstance(argv, list):
                self._options = PipelineOptions(argv)
            else:
                raise ValueError(
                    'Parameter argv, if specified, must be a list. Received : %r'
                    % argv)
        else:
            self._options = PipelineOptions([])

        FileSystems.set_options(self._options)

        if runner is None:
            runner = self._options.view_as(StandardOptions).runner
            if runner is None:
                runner = StandardOptions.DEFAULT_RUNNER
                logging.info(
                    ('Missing pipeline option (runner). Executing pipeline '
                     'using the default runner: %s.'), runner)

        if isinstance(runner, str):
            runner = create_runner(runner)
        elif not isinstance(runner, PipelineRunner):
            raise TypeError('Runner %s is not a PipelineRunner object or the '
                            'name of a registered runner.' % runner)

        # Validate pipeline options
        errors = PipelineOptionsValidator(self._options, runner).validate()
        if errors:
            raise ValueError('Pipeline has validation errors: \n' +
                             '\n'.join(errors))

        # set default experiments for portable runners
        # (needs to occur prior to pipeline construction)
        if runner.is_fnapi_compatible():
            experiments = (self._options.view_as(DebugOptions).experiments
                           or [])
            if 'beam_fn_api' not in experiments:
                experiments.append('beam_fn_api')
                self._options.view_as(DebugOptions).experiments = experiments

        # Default runner to be used.
        self.runner = runner
        # Stack of transforms generated by nested apply() calls. The stack will
        # contain a root node as an enclosing (parent) node for top transforms.
        self.transforms_stack = [AppliedPTransform(None, None, '', None)]
        # Set of transform labels (full labels) applied to the pipeline.
        # If a transform is applied and the full label is already in the set
        # then the transform will have to be cloned with a new label.
        self.applied_labels = set()  # type: Set[str]
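Because __init__ runs PipelineOptionsValidator before anything else, invalid options fail fast at construction time rather than at job submission. A hedged usage sketch with placeholder option values:

 import apache_beam as beam
 from apache_beam.options.pipeline_options import PipelineOptions

 options = PipelineOptions([
     '--runner=DataflowRunner',
     '--project=example:example',     # placeholder project
     '--temp_location=gs://foo/bar',  # placeholder bucket
     '--region=us-central1',
 ])

 try:
   pipeline = beam.Pipeline(options=options)
 except ValueError as exc:
   # Raised when the validator returns errors, per the code above.
   print('Validation failed:\n%s' % exc)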
Example #24
 def test_type_check_additional_unrecognized_feature(self):
     runner = MockRunners.OtherRunner()
     options = PipelineOptions(['--type_check_additional=all,dfgdf'])
     validator = PipelineOptionsValidator(options, runner)
     errors = validator.validate()
     self.assertTrue(errors)
Example #25
  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified, otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ValueError: if either the runner or options argument is not
        of the expected type.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self._options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self._options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r' %
            argv)
    else:
      self._options = PipelineOptions([])

    FileSystems.set_options(self._options)

    if runner is None:
      runner = self._options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    errors = PipelineOptionsValidator(self._options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validation errors: \n' + '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()