Example #1
0
 def __init__(
         self,
         proto=None,  # type: Optional[Union[beam_runner_api_pb2.Components, beam_fn_api_pb2.ProcessBundleDescriptor]]
         default_environment=None,  # type: Optional[environments.Environment]
         use_fake_coders=False,
         iterable_state_read=None,  # type: Optional[IterableStateReader]
         iterable_state_write=None,  # type: Optional[IterableStateWriter]
         namespace='ref',
         allow_proto_holders=False):
     """Builds the per-component-type maps used to translate a pipeline.

     Accepts either a ``Components`` proto or a
     ``ProcessBundleDescriptor``; the latter's shared sub-maps are
     repackaged into a ``Components`` proto first.
     """
     if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
         # A bundle descriptor carries the same coder / windowing-strategy /
         # environment maps as a Components proto; copy them across.
         proto = beam_runner_api_pb2.Components(
             coders=dict(proto.coders.items()),
             windowing_strategies=dict(proto.windowing_strategies.items()),
             environments=dict(proto.environments.items()))
     for attr_name, component_cls in self._COMPONENT_TYPES.items():
         component_map = _PipelineContextMap(
             self, component_cls, namespace, getattr(proto, attr_name, None))
         setattr(self, attr_name, component_map)
     # Register the default environment up front (if given) so it gets an id.
     self._default_environment_id = (
         self.environments.get_id(
             default_environment, label='default_environment')
         if default_environment else None)
     self.use_fake_coders = use_fake_coders
     self.iterable_state_read = iterable_state_read
     self.iterable_state_write = iterable_state_write
     self.allow_proto_holders = allow_proto_holders
Example #2
0
 def __init__(self,
              proto=None,
              default_environment=None,
              use_fake_coders=False,
              iterable_state_read=None,
              iterable_state_write=None,
              namespace='ref'):
     """Builds per-component-type maps, optionally seeded from ``proto``.

     ``proto`` may be a ``Components`` proto or a
     ``ProcessBundleDescriptor`` whose shared maps are repackaged.
     """
     if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
         # Repackage the descriptor's shared maps as a Components proto.
         proto = beam_runner_api_pb2.Components(
             coders=dict(proto.coders.items()),
             windowing_strategies=dict(proto.windowing_strategies.items()),
             environments=dict(proto.environments.items()))
     for attr_name, component_cls in self._COMPONENT_TYPES.items():
         component_map = _PipelineContextMap(
             self, component_cls, namespace, getattr(proto, attr_name, None))
         setattr(self, attr_name, component_map)
     # Wrap and register the default environment so later lookups can
     # reference it by id.
     self._default_environment_id = (
         self.environments.get_id(
             Environment(default_environment), label='default_environment')
         if default_environment else None)
     self.use_fake_coders = use_fake_coders
     self.iterable_state_read = iterable_state_read
     self.iterable_state_write = iterable_state_write
Example #3
0
 def test_stage_resources(self):
     """Verifies _stage_resources stages every environment dependency.

     Builds a pipeline proto with two environments, each declaring two
     FILE artifacts with STAGING_TO roles, and asserts the stager is
     invoked exactly once with all four (path, staged_name) pairs.
     """
     def file_artifact(path, staged_name):
         # One FILE-typed artifact to be staged under ``staged_name``.
         # Extracted because the original spelled this literal out four
         # times with only the path and staged name varying.
         return beam_runner_api_pb2.ArtifactInformation(
             type_urn=common_urns.artifact_types.FILE.urn,
             type_payload=beam_runner_api_pb2.ArtifactFilePayload(
                 path=path).SerializeToString(),
             role_urn=common_urns.artifact_roles.STAGING_TO.urn,
             role_payload=beam_runner_api_pb2.ArtifactStagingToRolePayload(
                 staged_name=staged_name).SerializeToString())

     pipeline_options = PipelineOptions([
         '--temp_location', 'gs://test-location/temp', '--staging_location',
         'gs://test-location/staging', '--no_auth'
     ])
     pipeline = beam_runner_api_pb2.Pipeline(
         components=beam_runner_api_pb2.Components(
             environments={
                 'env1': beam_runner_api_pb2.Environment(dependencies=[
                     file_artifact('/tmp/foo1', 'foo1'),
                     file_artifact('/tmp/bar1', 'bar1'),
                 ]),
                 'env2': beam_runner_api_pb2.Environment(dependencies=[
                     file_artifact('/tmp/foo2', 'foo2'),
                     file_artifact('/tmp/bar2', 'bar2'),
                 ]),
             }))
     client = apiclient.DataflowApplicationClient(pipeline_options)
     with mock.patch.object(apiclient._LegacyDataflowStager,
                            'stage_job_resources') as mock_stager:
         client._stage_resources(pipeline, pipeline_options)
     # All dependencies across both environments go to one staging call.
     mock_stager.assert_called_once_with(
         [('/tmp/foo1', 'foo1'), ('/tmp/bar1', 'bar1'),
          ('/tmp/foo2', 'foo2'), ('/tmp/bar2', 'bar2')],
         staging_location='gs://test-location/staging')
 def __init__(self, proto=None):
     """Populates the component-type maps, optionally seeded from ``proto``."""
     if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
         # Repackage the descriptor's shared maps as a Components proto.
         proto = beam_runner_api_pb2.Components(
             coders=dict(proto.coders.items()),
             windowing_strategies=dict(proto.windowing_strategies.items()),
             environments=dict(proto.environments.items()))
     for component_name, component_cls in self._COMPONENT_TYPES.items():
         component_map = _PipelineContextMap(
             self, component_cls, getattr(proto, component_name, None))
         setattr(self, component_name, component_map)
Example #5
0
  def __init__(self,
               proto=None,  # type: Optional[Union[beam_runner_api_pb2.Components, beam_fn_api_pb2.ProcessBundleDescriptor]]
               component_id_map=None,  # type: Optional[pipeline.ComponentIdMap]
               default_environment=None,  # type: Optional[environments.Environment]
               use_fake_coders=False,  # type: bool
               iterable_state_read=None,  # type: Optional[IterableStateReader]
               iterable_state_write=None,  # type: Optional[IterableStateWriter]
               namespace='ref',  # type: str
               requirements=(),  # type: Iterable[str]
              ):
    # type: (...) -> None
    """Builds the per-component-type maps used to translate a pipeline."""
    if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
      # A bundle descriptor carries the same coder / windowing-strategy /
      # environment maps as a Components proto; copy them across.
      proto = beam_runner_api_pb2.Components(
          coders=dict(proto.coders.items()),
          windowing_strategies=dict(proto.windowing_strategies.items()),
          environments=dict(proto.environments.items()))

    self.component_id_map = component_id_map or ComponentIdMap(namespace)
    assert self.component_id_map.namespace == namespace

    # One map per component kind, each seeded from the matching proto field
    # when a proto was supplied.
    for attr_name, component_cls in (
        ('transforms', pipeline.AppliedPTransform),
        ('pcollections', pvalue.PCollection),
        ('coders', coders.Coder),
        ('windowing_strategies', core.Windowing),
        ('environments', environments.Environment)):
      setattr(
          self, attr_name,
          _PipelineContextMap(
              self, component_cls, namespace,
              getattr(proto, attr_name) if proto is not None else None))

    if default_environment:
      # Register the default environment up front so it gets an id.
      self._default_environment_id = self.environments.get_id(
          default_environment,
          label='default_environment')  # type: Optional[str]
    else:
      self._default_environment_id = None
    self.use_fake_coders = use_fake_coders
    self.iterable_state_read = iterable_state_read
    self.iterable_state_write = iterable_state_write
    self._requirements = set(requirements)
Example #6
0
  def to_runner_api(self):
    # type: () -> beam_runner_api_pb2.Components
    """Serializes every component map into a single Components proto."""
    result = beam_runner_api_pb2.Components()
    for field_name in ('transforms', 'pcollections', 'coders',
                       'windowing_strategies', 'environments'):
      getattr(self, field_name).populate_map(getattr(result, field_name))
    return result
Example #7
0
 def __init__(self, proto=None, default_environment_url=None):
     """Populates component maps; optionally registers a default Docker env.

     ``default_environment_url`` is used both as the environment URL and
     as the Docker container image for the default environment.
     """
     if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
         # Repackage the descriptor's shared maps as a Components proto.
         proto = beam_runner_api_pb2.Components(
             coders=dict(proto.coders.items()),
             windowing_strategies=dict(proto.windowing_strategies.items()),
             environments=dict(proto.environments.items()))
     for component_name, component_cls in self._COMPONENT_TYPES.items():
         component_map = _PipelineContextMap(
             self, component_cls, getattr(proto, component_name, None))
         setattr(self, component_name, component_map)
     self._default_environment_id = None
     if default_environment_url:
         docker_payload = beam_runner_api_pb2.DockerPayload(
             container_image=default_environment_url)
         env_proto = beam_runner_api_pb2.Environment(
             url=default_environment_url,
             urn=common_urns.environments.DOCKER.urn,
             payload=docker_payload.SerializeToString())
         self._default_environment_id = self.environments.get_id(
             Environment(env_proto))
 def to_runner_api(self):
     """Serializes all component maps into a single Components proto."""
     result = beam_runner_api_pb2.Components()
     for component_name in self._COMPONENT_TYPES:
         component_map = getattr(self, component_name)
         component_map.populate_map(getattr(result, component_name))
     return result
Example #9
0
  def executable_stage_transform(
      self, known_runner_urns, all_consumers, components):
    """Fuses this stage's transforms into one executable-stage PTransform.

    Args:
      known_runner_urns: URNs the runner understands natively.  A stage
        consisting of a single such transform is returned unchanged.
      all_consumers: maps a PCollection id to the set of ``id()`` values of
        the transforms consuming it; used to detect outputs consumed
        outside this stage.
      components: the pipeline's Components proto; a pruned copy is
        embedded in the resulting stage payload.

    Returns:
      A ``beam_runner_api_pb2.PTransform`` whose spec carries an
      ``ExecutableStagePayload`` covering all transforms in this stage.
    """
    if (len(self.transforms) == 1
        and self.transforms[0].spec.urn in known_runner_urns):
      # Nothing to fuse: the runner can execute this transform directly.
      return self.transforms[0]

    else:
      all_inputs = set(
          pcoll for t in self.transforms for pcoll in t.inputs.values())
      all_outputs = set(
          pcoll for t in self.transforms for pcoll in t.outputs.values())
      internal_transforms = set(id(t) for t in self.transforms)
      # Outputs with at least one consumer outside this stage must be
      # exposed on the fused transform.
      external_outputs = [pcoll for pcoll in all_outputs
                          if all_consumers[pcoll] - internal_transforms]

      stage_components = beam_runner_api_pb2.Components()
      stage_components.CopyFrom(components)

      # Only keep the referenced PCollections.
      # Materialize the keys first: deleting from a proto map field while
      # iterating its live keys() view raises a RuntimeError.
      for pcoll_id in list(stage_components.pcollections.keys()):
        if pcoll_id not in all_inputs and pcoll_id not in all_outputs:
          del stage_components.pcollections[pcoll_id]

      # Only keep the transforms in this stage.
      # Also gather up payload data as we iterate over the transforms.
      stage_components.transforms.clear()
      main_inputs = set()
      side_inputs = []
      user_states = []
      timers = []
      for ix, transform in enumerate(self.transforms):
        transform_id = 'transform_%d' % ix
        if transform.spec.urn == common_urns.primitives.PAR_DO.urn:
          payload = proto_utils.parse_Bytes(
              transform.spec.payload, beam_runner_api_pb2.ParDoPayload)
          # Record side inputs, user state and timers by
          # (transform_id, local tag) so the runner can wire them up.
          for tag in payload.side_inputs.keys():
            side_inputs.append(
                beam_runner_api_pb2.ExecutableStagePayload.SideInputId(
                    transform_id=transform_id,
                    local_name=tag))
          for tag in payload.state_specs.keys():
            user_states.append(
                beam_runner_api_pb2.ExecutableStagePayload.UserStateId(
                    transform_id=transform_id,
                    local_name=tag))
          for tag in payload.timer_specs.keys():
            timers.append(
                beam_runner_api_pb2.ExecutableStagePayload.TimerId(
                    transform_id=transform_id,
                    local_name=tag))
          # Side-input edges are not main inputs.
          main_inputs.update(
              pcoll_id
              for tag, pcoll_id in transform.inputs.items()
              if tag not in payload.side_inputs)
        else:
          main_inputs.update(transform.inputs.values())
        stage_components.transforms[transform_id].CopyFrom(transform)

      # Exactly one main input must originate outside the stage.
      main_input_id = only_element(main_inputs - all_outputs)
      named_inputs = dict({
          '%s:%s' % (side.transform_id, side.local_name):
          stage_components.transforms[side.transform_id].inputs[side.local_name]
          for side in side_inputs
      }, main_input=main_input_id)
      payload = beam_runner_api_pb2.ExecutableStagePayload(
          environment=components.environments[self.environment],
          input=main_input_id,
          outputs=external_outputs,
          transforms=stage_components.transforms.keys(),
          components=stage_components,
          side_inputs=side_inputs,
          user_states=user_states,
          timers=timers)

      return beam_runner_api_pb2.PTransform(
          unique_name=unique_name(None, self.name),
          spec=beam_runner_api_pb2.FunctionSpec(
              urn='beam:runner:executable_stage:v1',
              payload=payload.SerializeToString()),
          inputs=named_inputs,
          outputs={'output_%d' % ix: pcoll
                   for ix, pcoll in enumerate(external_outputs)})