    def test_labels(self):
        pipeline_options = PipelineOptions([
            '--project', 'test_project', '--job_name', 'test_job_name',
            '--temp_location', 'gs://test-location/temp'
        ])
        job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
        self.assertIsNone(job.proto.labels)

        pipeline_options = PipelineOptions([
            '--project', 'test_project', '--job_name', 'test_job_name',
            '--temp_location', 'gs://test-location/temp', '--label',
            'key1=value1', '--label', 'key2', '--label', 'key3=value3',
            '--labels', 'key4=value4', '--labels', 'key5'
        ])
        job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
        self.assertEqual(5, len(job.proto.labels.additionalProperties))
        self.assertEqual('key1', job.proto.labels.additionalProperties[0].key)
        self.assertEqual('value1',
                         job.proto.labels.additionalProperties[0].value)
        self.assertEqual('key2', job.proto.labels.additionalProperties[1].key)
        self.assertEqual('', job.proto.labels.additionalProperties[1].value)
        self.assertEqual('key3', job.proto.labels.additionalProperties[2].key)
        self.assertEqual('value3',
                         job.proto.labels.additionalProperties[2].value)
        self.assertEqual('key4', job.proto.labels.additionalProperties[3].key)
        self.assertEqual('value4',
                         job.proto.labels.additionalProperties[3].value)
        self.assertEqual('key5', job.proto.labels.additionalProperties[4].key)
        self.assertEqual('', job.proto.labels.additionalProperties[4].value)
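The per-index assertions above can be collapsed into a single comparison; a minimal sketch (reusing job and self from the test above, nothing else assumed):

        # Sketch: build a plain dict from the labels proto, then compare once.
        labels = {
            prop.key: prop.value
            for prop in job.proto.labels.additionalProperties
        }
        self.assertEqual(
            {'key1': 'value1', 'key2': '', 'key3': 'value3',
             'key4': 'value4', 'key5': ''},
            labels)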
Example #2
    def run(self, pipeline):
        """Remotely executes entire pipeline or parts reachable from node."""
        # Import here to avoid adding the dependency for local running scenarios.
        # pylint: disable=wrong-import-order, wrong-import-position
        from apache_beam.runners.dataflow.internal import apiclient
        self.job = apiclient.Job(pipeline.options)

        # The superclass's run will trigger a traversal of all reachable nodes.
        super(DataflowRunner, self).run(pipeline)

        standard_options = pipeline.options.view_as(StandardOptions)
        if standard_options.streaming:
            job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION
        else:
            job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION

        # Get a Dataflow API client and set its options
        self.dataflow_client = apiclient.DataflowApplicationClient(
            pipeline.options, job_version)

        # Create the job
        result = DataflowPipelineResult(
            self.dataflow_client.create_job(self.job), self)

        self._metrics = DataflowMetrics(self.dataflow_client, result)
        result.metric_results = self._metrics
        return result
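For orientation, a minimal sketch (hypothetical project, bucket, and job names) of how user code reaches this run() method: the --runner flag selects DataflowRunner, and the Pipeline drives it.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Sketch: placeholder option values; --runner routes execution through
# DataflowRunner.run() shown above.
options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',
    '--temp_location=gs://my-bucket/temp',
    '--job_name=example-job',
])
with beam.Pipeline(options=options) as p:
  p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)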
Example #3
  def test_graph_is_uploaded(self):
    pipeline_options = PipelineOptions([
        '--project',
        'test_project',
        '--job_name',
        'test_job_name',
        '--temp_location',
        'gs://test-location/temp',
        '--experiments',
        'beam_fn_api',
        '--experiments',
        'upload_graph'
    ])
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    client = apiclient.DataflowApplicationClient(pipeline_options)
    with mock.patch.object(client, 'stage_file', side_effect=None):
      with mock.patch.object(client, 'create_job_description',
                             side_effect=None):
        with mock.patch.object(client,
                               'submit_job_description',
                               side_effect=None):
          client.create_job(job)
          client.stage_file.assert_called_once_with(
              mock.ANY, "dataflow_graph.json", mock.ANY)
          client.create_job_description.assert_called_once()
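The three nested mock.patch.object calls can be flattened; a sketch (client and job are the objects built in the test above, behavior unchanged):

from contextlib import ExitStack
from unittest import mock

# Sketch: enter each patch through a single ExitStack instead of nesting.
with ExitStack() as stack:
  for method in ('stage_file', 'create_job_description',
                 'submit_job_description'):
    stack.enter_context(mock.patch.object(client, method, side_effect=None))
  client.create_job(job)
  client.stage_file.assert_called_once_with(
      mock.ANY, "dataflow_graph.json", mock.ANY)
  client.create_job_description.assert_called_once()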
Example #4
    def test_translate_portable_job_step_name(self):
        mock_client, mock_job_result = self.setup_mock_client_result(
            self.ONLY_COUNTERS_LIST)

        pipeline_options = PipelineOptions([
            '--experiments=use_runner_v2',
            '--experiments=use_portable_job_submission',
            '--temp_location=gs://any-location/temp',
            '--project=dummy_project',
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        job = apiclient.Job(pipeline_options, proto_pipeline)
        dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result,
                                              job)
        self.assertEqual(
            'MyTestParDo',
            dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
Example #5
  def test_create_job_returns_existing_job(self):
    pipeline_options = PipelineOptions([
        '--project',
        'test_project',
        '--job_name',
        'test_job_name',
        '--temp_location',
        'gs://test-location/temp',
    ])
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string
    pipeline_options.view_as(GoogleCloudOptions).no_auth = True
    client = apiclient.DataflowApplicationClient(pipeline_options)

    response = dataflow.Job()
    # different clientRequestId from `job`
    response.clientRequestId = "20210821081910123456-1234"
    response.name = 'test_job_name'
    response.id = '2021-08-19_21_18_43-9756917246311111021'

    with mock.patch.object(client._client.projects_locations_jobs,
                           'Create',
                           side_effect=[response]):
      with mock.patch.object(client, 'create_job_description',
                             side_effect=None):
        with self.assertRaises(
            apiclient.DataflowJobAlreadyExistsError) as context:
          client.create_job(job)

        self.assertEqual(
            str(context.exception),
            'There is already active job named %s with id: %s. If you want to '
            'submit a second job, try again by setting a different name using '
            '--job_name.' % ('test_job_name', response.id))
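As a usage note, a minimal sketch (client and job as constructed in the test above) of handling this duplicate-job error on the caller's side instead of letting it propagate:

import logging

try:
  client.create_job(job)
except apiclient.DataflowJobAlreadyExistsError as exc:
  # The exception message already names the conflicting job and its id.
  logging.warning('Dataflow job already exists, not resubmitting: %s', exc)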
Example #6
  def test_transform_name_mapping(self, mock_job):
    # mock_job is injected by a mock.patch decorator that is not shown here.
    pipeline_options = PipelineOptions(
        ['--project', 'test_project', '--job_name', 'test_job_name',
         '--temp_location', 'gs://test-location/temp', '--update',
         '--transform_name_mapping', '{\"from\":\"to\"}'])
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    self.assertIsNotNone(job.proto.transformNameMapping)
Example #7
  def test_created_from_snapshot_id(self):
    pipeline_options = PipelineOptions([
        '--project', 'test_project', '--job_name', 'test_job_name',
        '--temp_location', 'gs://test-location/temp',
        '--create_from_snapshot', 'test_snapshot_id'
    ])
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    self.assertEqual('test_snapshot_id', job.proto.createdFromSnapshotId)
Example #8
  def run_pipeline(self, pipeline):
    """Remotely executes entire pipeline or parts reachable from node."""
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.runners.dataflow.internal import apiclient
    except ImportError:
      raise ImportError(
          'Google Cloud Dataflow runner not available, '
          'please install apache_beam[gcp]')

    # Snapshot the pipeline in a portable proto before mutating it
    proto_pipeline, self.proto_context = pipeline.to_runner_api(
        return_context=True)

    # Performing configured PTransform overrides.
    pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

    # Add setup_options for all the BeamPlugin imports
    setup_options = pipeline._options.view_as(SetupOptions)
    plugins = BeamPlugin.get_all_plugin_paths()
    if setup_options.beam_plugins is not None:
      plugins = list(set(plugins + setup_options.beam_plugins))
    setup_options.beam_plugins = plugins

    self.job = apiclient.Job(pipeline._options, proto_pipeline)

    # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
    # here.
    pipeline.visit(self.group_by_key_input_visitor())

    # Dataflow runner requires output type of the Flatten to be the same as the
    # inputs, hence we enforce that here.
    pipeline.visit(self.flatten_input_visitor())

    # The superclass's run will trigger a traversal of all reachable nodes.
    super(DataflowRunner, self).run_pipeline(pipeline)

    test_options = pipeline._options.view_as(TestOptions)
    # If it is a dry run, return without submitting the job.
    if test_options.dry_run:
      return None

    # Get a Dataflow API client and set its options
    self.dataflow_client = apiclient.DataflowApplicationClient(
        pipeline._options)

    # Create the job description and send a request to the service. The result
    # can be None if there is no need to send a request to the service (e.g.
    # template creation). If a request was sent and failed then the call will
    # raise an exception.
    result = DataflowPipelineResult(
        self.dataflow_client.create_job(self.job), self)

    self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
    result.metric_results = self._metrics
    return result
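A minimal sketch (runner is assumed to be a DataflowRunner instance and pipeline a constructed Pipeline) of the dry-run branch read above: with TestOptions.dry_run set, run_pipeline() returns None and never contacts the service.

from apache_beam.options.pipeline_options import TestOptions

# Sketch: flip dry_run on the pipeline's options before running; the method
# bails out after graph construction, before any Dataflow API call.
pipeline._options.view_as(TestOptions).dry_run = True
assert runner.run_pipeline(pipeline) is None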
Example #9
  def run(self, pipeline):
    """Remotely executes entire pipeline or parts reachable from node."""
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.runners.dataflow.internal import apiclient
    except ImportError:
      raise ImportError(
          'Google Cloud Dataflow runner not available, '
          'please install apache_beam[gcp]')

    # Performing configured PTransform overrides.
    pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

    # Add setup_options for all the BeamPlugin imports
    setup_options = pipeline._options.view_as(SetupOptions)
    plugins = BeamPlugin.get_all_plugin_paths()
    if setup_options.beam_plugins is not None:
      plugins = list(set(plugins + setup_options.beam_plugins))
    setup_options.beam_plugins = plugins

    self.job = apiclient.Job(pipeline._options)

    # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
    # here.
    pipeline.visit(self.group_by_key_input_visitor())

    # Dataflow runner requires output type of the Flatten to be the same as the
    # inputs, hence we enforce that here.
    pipeline.visit(self.flatten_input_visitor())

    # The superclass's run will trigger a traversal of all reachable nodes.
    super(DataflowRunner, self).run(pipeline)

    test_options = pipeline._options.view_as(TestOptions)
    # If it is a dry run, return without submitting the job.
    if test_options.dry_run:
      return None

    standard_options = pipeline._options.view_as(StandardOptions)
    if standard_options.streaming:
      job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION
    else:
      job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION

    # Get a Dataflow API client and set its options
    self.dataflow_client = apiclient.DataflowApplicationClient(
        pipeline._options, job_version)

    # Create the job
    result = DataflowPipelineResult(
        self.dataflow_client.create_job(self.job), self)

    self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
    result.metric_results = self._metrics
    return result
Example #10
    def test_remote_runner_translation(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
         | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
         | ptransform.GroupByKey())
        remote_runner.job = apiclient.Job(p.options)
        super(DataflowRunner, remote_runner).run(p)
Example #11
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Expose the wrapped DoFn, this class, and the timestamp as display data.
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    remote_runner.job = apiclient.Job(p.options)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[1]  # steps[0] belongs to the Create transform; SpecialParDo is second.
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__+ '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
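A minimal sketch (standalone DoFn with made-up values) of the same display_data() hook in user code; the runner serializes whatever this returns into the step's display_data properties checked above.

import apache_beam as beam

class AnnotatedDoFn(beam.DoFn):
  def display_data(self):
    # These values surface in the job proto and in the Dataflow monitoring UI.
    return {'source': 'gs://my-bucket/input.txt', 'threshold': 42}

  def process(self, element):
    yield element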
Example #12
    def test_bad_path(self):
        dummy_sdk_file = tempfile.NamedTemporaryFile()
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name, '--job_name=test-job',
                '--project=test-project', '--staging_location=ignored',
                '--temp_location=/dev/null', '--template_location=/bad/path',
                '--no_auth=True'
            ]))
        remote_runner.job = apiclient.Job(pipeline._options)

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
Example #13
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #14
  def test_update_job_returns_existing_job(self):
    pipeline_options = PipelineOptions([
        '--project',
        'test_project',
        '--job_name',
        'test_job_name',
        '--temp_location',
        'gs://test-location/temp',
        '--region',
        'us-central1',
        '--update',
    ])
    replace_job_id = '2021-08-21_00_00_01-6081497447916622336'
    with mock.patch('apache_beam.runners.dataflow.internal.apiclient.Job.'
                    'job_id_for_name',
                    return_value=replace_job_id) as job_id_for_name_mock:
      job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    job_id_for_name_mock.assert_called_once()

    self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string

    pipeline_options.view_as(GoogleCloudOptions).no_auth = True
    client = apiclient.DataflowApplicationClient(pipeline_options)

    response = dataflow.Job()
    # different clientRequestId from `job`
    response.clientRequestId = "20210821083254123456-1234"
    response.name = 'test_job_name'
    response.id = '2021-08-19_21_29_07-5725551945600207770'

    with mock.patch.object(client, 'create_job_description', side_effect=None):
      with mock.patch.object(client._client.projects_locations_jobs,
                             'Create',
                             side_effect=[response]):

        with self.assertRaises(
            apiclient.DataflowJobAlreadyExistsError) as context:
          client.create_job(job)

      self.assertEqual(
          str(context.exception),
          'The job named %s with id: %s has already been updated into job '
          'id: %s and cannot be updated again.' %
          ('test_job_name', replace_job_id, response.id))
Example #15
  def test_template_file_generation_with_upload_graph(self):
    pipeline_options = PipelineOptions([
        '--project',
        'test_project',
        '--job_name',
        'test_job_name',
        '--temp_location',
        'gs://test-location/temp',
        '--experiments',
        'upload_graph',
        '--template_location',
        'gs://test-location/template'
    ])
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    job.proto.steps.append(dataflow.Step(name='test_step_name'))

    pipeline_options.view_as(GoogleCloudOptions).no_auth = True
    client = apiclient.DataflowApplicationClient(pipeline_options)
    with mock.patch.object(client, 'stage_file', side_effect=None):
      with mock.patch.object(client, 'create_job_description',
                             side_effect=None):
        with mock.patch.object(client,
                               'submit_job_description',
                               side_effect=None):
          client.create_job(job)

          client.stage_file.assert_has_calls([
              mock.call(mock.ANY, 'dataflow_graph.json', mock.ANY),
              mock.call(mock.ANY, 'template', mock.ANY)
          ])
          client.create_job_description.assert_called_once()
          # template is generated, but job should not be submitted to the
          # service.
          client.submit_job_description.assert_not_called()

          template_filename = client.stage_file.call_args_list[-1][0][1]
          self.assertTrue('template' in template_filename)
          template_content = client.stage_file.call_args_list[-1][0][2].read(
          ).decode('utf-8')
          template_obj = json.loads(template_content)
          self.assertFalse(template_obj.get('steps'))
          self.assertTrue(template_obj['stepsLocation'])
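A minimal sketch (hypothetical bucket paths) of the classic-template flow this test exercises: with --template_location set the runner stages a template file rather than launching a job, and the upload_graph experiment moves the steps into the separate stepsLocation file asserted above.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',
    '--temp_location=gs://my-bucket/temp',
    '--template_location=gs://my-bucket/templates/wordcount',
    '--experiments=upload_graph',
])
# Sketch: running the pipeline writes the template to --template_location;
# nothing is submitted to the Dataflow service.
with beam.Pipeline(options=options) as p:
  p | beam.Create(['a', 'b']) | beam.Map(str.upper)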
Example #16
  def run(self, pipeline):
    """Remotely executes entire pipeline or parts reachable from node."""
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.runners.dataflow.internal import apiclient
    except ImportError:
      raise ImportError(
          'Google Cloud Dataflow runner not available, '
          'please install apache_beam[gcp]')
    self.job = apiclient.Job(pipeline._options)

    # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
    # here.
    pipeline.visit(self.group_by_key_input_visitor())

    # Dataflow runner requires output type of the Flatten to be the same as the
    # inputs, hence we enforce that here.
    pipeline.visit(self.flatten_input_visitor())

    # The superclass's run will trigger a traversal of all reachable nodes.
    super(DataflowRunner, self).run(pipeline)

    standard_options = pipeline._options.view_as(StandardOptions)
    if standard_options.streaming:
      job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION
    else:
      job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION

    # Get a Dataflow API client and set its options
    self.dataflow_client = apiclient.DataflowApplicationClient(
        pipeline._options, job_version)

    # Create the job
    result = DataflowPipelineResult(
        self.dataflow_client.create_job(self.job), self)

    self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
    result.metric_results = self._metrics
    return result