def __init__(
    self,
    proto=None,  # type: Optional[Union[beam_runner_api_pb2.Components, beam_fn_api_pb2.ProcessBundleDescriptor]]
    default_environment=None,  # type: Optional[environments.Environment]
    use_fake_coders=False,
    iterable_state_read=None,  # type: Optional[IterableStateReader]
    iterable_state_write=None,  # type: Optional[IterableStateWriter]
    namespace='ref',
    allow_proto_holders=False):
  """Sets up the per-component-type id maps used for (de)serialization.

  A ProcessBundleDescriptor may be given in place of a Components proto;
  only its coders, windowing strategies and environments are carried over.
  """
  if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
    # Re-wrap the descriptor as a Components proto; transforms and
    # pcollections are intentionally not carried over.
    proto = beam_runner_api_pb2.Components(
        coders=dict(proto.coders.items()),
        windowing_strategies=dict(proto.windowing_strategies.items()),
        environments=dict(proto.environments.items()))
  # One _PipelineContextMap per component kind (coders, environments, ...).
  for attr_name, component_cls in self._COMPONENT_TYPES.items():
    component_map = _PipelineContextMap(
        self, component_cls, namespace, getattr(proto, attr_name, None))
    setattr(self, attr_name, component_map)
  self._default_environment_id = (
      self.environments.get_id(default_environment, label='default_environment')
      if default_environment else None)
  self.use_fake_coders = use_fake_coders
  self.iterable_state_read = iterable_state_read
  self.iterable_state_write = iterable_state_write
  self.allow_proto_holders = allow_proto_holders
def __init__(
    self,
    proto=None,
    default_environment=None,
    use_fake_coders=False,
    iterable_state_read=None,
    iterable_state_write=None,
    namespace='ref'):
  """Builds the component id maps for this context.

  Accepts either a Components proto or a ProcessBundleDescriptor; in the
  latter case only coders, windowing strategies and environments are kept.
  """
  if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
    # Re-wrap the descriptor's relevant maps as a Components proto.
    proto = beam_runner_api_pb2.Components(
        coders=dict(proto.coders.items()),
        windowing_strategies=dict(proto.windowing_strategies.items()),
        environments=dict(proto.environments.items()))
  # Create one _PipelineContextMap attribute per registered component type.
  for attr_name, component_cls in self._COMPONENT_TYPES.items():
    component_map = _PipelineContextMap(
        self, component_cls, namespace, getattr(proto, attr_name, None))
    setattr(self, attr_name, component_map)
  self._default_environment_id = (
      self.environments.get_id(
          Environment(default_environment), label='default_environment')
      if default_environment else None)
  self.use_fake_coders = use_fake_coders
  self.iterable_state_read = iterable_state_read
  self.iterable_state_write = iterable_state_write
def test_stage_resources(self):
  """_stage_resources stages every FILE dependency across all environments."""
  pipeline_options = PipelineOptions([
      '--temp_location',
      'gs://test-location/temp',
      '--staging_location',
      'gs://test-location/staging',
      '--no_auth'
  ])

  def file_artifact(path, staged_name):
    # A FILE-typed dependency staged under the given name.
    return beam_runner_api_pb2.ArtifactInformation(
        type_urn=common_urns.artifact_types.FILE.urn,
        type_payload=beam_runner_api_pb2.ArtifactFilePayload(
            path=path).SerializeToString(),
        role_urn=common_urns.artifact_roles.STAGING_TO.urn,
        role_payload=beam_runner_api_pb2.ArtifactStagingToRolePayload(
            staged_name=staged_name).SerializeToString())

  pipeline = beam_runner_api_pb2.Pipeline(
      components=beam_runner_api_pb2.Components(
          environments={
              'env1': beam_runner_api_pb2.Environment(
                  dependencies=[
                      file_artifact('/tmp/foo1', 'foo1'),
                      file_artifact('/tmp/bar1', 'bar1'),
                  ]),
              'env2': beam_runner_api_pb2.Environment(
                  dependencies=[
                      file_artifact('/tmp/foo2', 'foo2'),
                      file_artifact('/tmp/bar2', 'bar2'),
                  ]),
          }))
  client = apiclient.DataflowApplicationClient(pipeline_options)
  with mock.patch.object(apiclient._LegacyDataflowStager,
                         'stage_job_resources') as mock_stager:
    client._stage_resources(pipeline, pipeline_options)
  # All four artifacts, in environment order, go to the staging location.
  mock_stager.assert_called_once_with(
      [('/tmp/foo1', 'foo1'), ('/tmp/bar1', 'bar1'), ('/tmp/foo2', 'foo2'),
       ('/tmp/bar2', 'bar2')],
      staging_location='gs://test-location/staging')
def __init__(self, proto=None):
  """Builds one _PipelineContextMap per component type from *proto*.

  Accepts either a Components proto or a ProcessBundleDescriptor; for the
  latter, only coders, windowing strategies and environments are kept.
  """
  if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
    proto = beam_runner_api_pb2.Components(
        coders=dict(proto.coders.items()),
        windowing_strategies=dict(proto.windowing_strategies.items()),
        environments=dict(proto.environments.items()))
  for attr_name, component_cls in self._COMPONENT_TYPES.items():
    component_map = _PipelineContextMap(
        self, component_cls, getattr(proto, attr_name, None))
    setattr(self, attr_name, component_map)
def __init__(self, proto=None, # type: Optional[Union[beam_runner_api_pb2.Components, beam_fn_api_pb2.ProcessBundleDescriptor]] component_id_map=None, # type: Optional[pipeline.ComponentIdMap] default_environment=None, # type: Optional[environments.Environment] use_fake_coders=False, # type: bool iterable_state_read=None, # type: Optional[IterableStateReader] iterable_state_write=None, # type: Optional[IterableStateWriter] namespace='ref', # type: str requirements=(), # type: Iterable[str] ): # type: (...) -> None if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor): proto = beam_runner_api_pb2.Components( coders=dict(proto.coders.items()), windowing_strategies=dict(proto.windowing_strategies.items()), environments=dict(proto.environments.items())) self.component_id_map = component_id_map or ComponentIdMap(namespace) assert self.component_id_map.namespace == namespace self.transforms = _PipelineContextMap( self, pipeline.AppliedPTransform, namespace, proto.transforms if proto is not None else None) self.pcollections = _PipelineContextMap( self, pvalue.PCollection, namespace, proto.pcollections if proto is not None else None) self.coders = _PipelineContextMap( self, coders.Coder, namespace, proto.coders if proto is not None else None) self.windowing_strategies = _PipelineContextMap( self, core.Windowing, namespace, proto.windowing_strategies if proto is not None else None) self.environments = _PipelineContextMap( self, environments.Environment, namespace, proto.environments if proto is not None else None) if default_environment: self._default_environment_id = self.environments.get_id( default_environment, label='default_environment') # type: Optional[str] else: self._default_environment_id = None self.use_fake_coders = use_fake_coders self.iterable_state_read = iterable_state_read self.iterable_state_write = iterable_state_write self._requirements = set(requirements)
def to_runner_api(self):
  # type: () -> beam_runner_api_pb2.Components
  """Serializes every tracked component map into a Components proto."""
  result = beam_runner_api_pb2.Components()
  for attr in ('transforms',
               'pcollections',
               'coders',
               'windowing_strategies',
               'environments'):
    getattr(self, attr).populate_map(getattr(result, attr))
  return result
def __init__(self, proto=None, default_environment_url=None):
  """Builds component maps; optionally registers a default Docker environment.

  When *default_environment_url* is given, it is used both as the
  environment's url and as its Docker container image.
  """
  if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
    proto = beam_runner_api_pb2.Components(
        coders=dict(proto.coders.items()),
        windowing_strategies=dict(proto.windowing_strategies.items()),
        environments=dict(proto.environments.items()))
  for attr_name, component_cls in self._COMPONENT_TYPES.items():
    component_map = _PipelineContextMap(
        self, component_cls, getattr(proto, attr_name, None))
    setattr(self, attr_name, component_map)
  if default_environment_url:
    docker_payload = beam_runner_api_pb2.DockerPayload(
        container_image=default_environment_url)
    environment_proto = beam_runner_api_pb2.Environment(
        url=default_environment_url,
        urn=common_urns.environments.DOCKER.urn,
        payload=docker_payload.SerializeToString())
    self._default_environment_id = self.environments.get_id(
        Environment(environment_proto))
  else:
    self._default_environment_id = None
def to_runner_api(self):
  """Dumps each component map into the matching field of a Components proto."""
  components = beam_runner_api_pb2.Components()
  for component_name in self._COMPONENT_TYPES:
    component_map = getattr(self, component_name)
    component_map.populate_map(getattr(components, component_name))
  return components
def executable_stage_transform(
    self, known_runner_urns, all_consumers, components):
  """Packages this fused stage as a single executable-stage PTransform.

  If the stage is a single transform the runner already understands, that
  transform is returned unchanged. Otherwise the stage's transforms are
  wrapped in a ``beam:runner:executable_stage:v1`` composite whose payload
  records the main input, side inputs, user state, timers and a pruned
  copy of the pipeline components needed to execute the stage.

  Args:
    known_runner_urns: set of transform URNs the runner executes natively.
    all_consumers: maps a PCollection id to the set of ``id()``s of
      transforms consuming it; used to find outputs escaping this stage.
    components: the full pipeline Components proto to prune a copy of.

  Returns:
    A beam_runner_api_pb2.PTransform representing the stage.
  """
  if (len(self.transforms) == 1 and
      self.transforms[0].spec.urn in known_runner_urns):
    return self.transforms[0]
  else:
    all_inputs = set(
        pcoll for t in self.transforms for pcoll in t.inputs.values())
    all_outputs = set(
        pcoll for t in self.transforms for pcoll in t.outputs.values())
    internal_transforms = set(id(t) for t in self.transforms)
    # Outputs consumed by any transform outside this stage must be exposed.
    external_outputs = [
        pcoll for pcoll in all_outputs
        if all_consumers[pcoll] - internal_transforms
    ]

    stage_components = beam_runner_api_pb2.Components()
    stage_components.CopyFrom(components)

    # Only keep the referenced PCollections.
    # BUGFIX: snapshot the keys before deleting — removing entries from a
    # proto map while iterating its live keys view invalidates the iterator
    # (RuntimeError on Python 3).
    for pcoll_id in list(stage_components.pcollections.keys()):
      if pcoll_id not in all_inputs and pcoll_id not in all_outputs:
        del stage_components.pcollections[pcoll_id]

    # Only keep the transforms in this stage.
    # Also gather up payload data as we iterate over the transforms.
    stage_components.transforms.clear()
    main_inputs = set()
    side_inputs = []
    user_states = []
    timers = []
    for ix, transform in enumerate(self.transforms):
      transform_id = 'transform_%d' % ix
      if transform.spec.urn == common_urns.primitives.PAR_DO.urn:
        payload = proto_utils.parse_Bytes(
            transform.spec.payload, beam_runner_api_pb2.ParDoPayload)
        for tag in payload.side_inputs.keys():
          side_inputs.append(
              beam_runner_api_pb2.ExecutableStagePayload.SideInputId(
                  transform_id=transform_id, local_name=tag))
        for tag in payload.state_specs.keys():
          user_states.append(
              beam_runner_api_pb2.ExecutableStagePayload.UserStateId(
                  transform_id=transform_id, local_name=tag))
        for tag in payload.timer_specs.keys():
          timers.append(
              beam_runner_api_pb2.ExecutableStagePayload.TimerId(
                  transform_id=transform_id, local_name=tag))
        # A ParDo's main inputs are whatever is not declared a side input.
        main_inputs.update(
            pcoll_id for tag, pcoll_id in transform.inputs.items()
            if tag not in payload.side_inputs)
      else:
        main_inputs.update(transform.inputs.values())
      stage_components.transforms[transform_id].CopyFrom(transform)

    # The fused stage consumes exactly one PCollection produced outside it.
    main_input_id = only_element(main_inputs - all_outputs)
    named_inputs = dict({
        '%s:%s' % (side.transform_id, side.local_name):
        stage_components.transforms[side.transform_id].inputs[side.local_name]
        for side in side_inputs
    }, main_input=main_input_id)
    payload = beam_runner_api_pb2.ExecutableStagePayload(
        environment=components.environments[self.environment],
        input=main_input_id,
        outputs=external_outputs,
        transforms=stage_components.transforms.keys(),
        components=stage_components,
        side_inputs=side_inputs,
        user_states=user_states,
        timers=timers)

    return beam_runner_api_pb2.PTransform(
        unique_name=unique_name(None, self.name),
        spec=beam_runner_api_pb2.FunctionSpec(
            urn='beam:runner:executable_stage:v1',
            payload=payload.SerializeToString()),
        inputs=named_inputs,
        outputs={
            'output_%d' % ix: pcoll
            for ix, pcoll in enumerate(external_outputs)
        })