def __init__(self, state_handler, key_coder, namespace_coder, state_cache_size,
             map_state_read_cache_size, map_state_write_cache_size):
    """Initialize the remote keyed state backend.

    :param state_handler: handler used for all raw state requests.
    :param key_coder: row coder for the current key; its field coders are
        wrapped in a FlattenRowCoder below.
    :param namespace_coder: optional coder for state namespaces (e.g. windows);
        may be None for non-windowed state.
    :param state_cache_size: max number of internal states kept in the LRU cache.
    :param map_state_read_cache_size: read-cache size for the map state handler.
    :param map_state_write_cache_size: write-cache size for map states.
    """
    self._state_handler = state_handler
    # Map state goes through a dedicated caching handler with its own read cache.
    self._map_state_handler = CachingMapStateHandler(
        state_handler, map_state_read_cache_size)
    from pyflink.fn_execution.coders import FlattenRowCoder
    # NOTE(review): reaches into key_coder's private _field_coders — presumably
    # to encode keys in flattened-row form; confirm against the coder's API.
    self._key_coder_impl = FlattenRowCoder(key_coder._field_coders).get_impl()
    self.namespace_coder = namespace_coder
    if namespace_coder:
        self._namespace_coder_impl = namespace_coder.get_impl()
    else:
        self._namespace_coder_impl = None
    self._state_cache_size = state_cache_size
    self._map_state_write_cache_size = map_state_write_cache_size
    # All states ever created, keyed by name (never evicted).
    self._all_states = {}
    # LRU cache of internal states; evicted states are committed back.
    self._internal_state_cache = LRUCache(self._state_cache_size, None)
    self._internal_state_cache.set_on_evict(
        lambda key, value: self.commit_internal_state(value))
    self._current_key = None
    self._encoded_current_key = None
    # Sentinel state key used to signal "clear iterators" to the runner;
    # key is None at construction time (no current key set yet).
    self._clear_iterator_mark = beam_fn_api_pb2.StateKey(
        multimap_side_input=beam_fn_api_pb2.StateKey.MultimapSideInput(
            transform_id="clear_iterators",
            side_input_id="clear_iterators",
            key=self._encoded_current_key))
def __init__(self, state_handler, key_coder, state_cache_size,
             map_state_read_cache_size, map_state_write_cache_size):
    """Initialize the remote keyed state backend.

    :param state_handler: handler used for all raw state requests.
    :param key_coder: coder for the current key.
    :param state_cache_size: max number of internal states kept in the LRU cache.
    :param map_state_read_cache_size: read-cache size for the map state handler.
    :param map_state_write_cache_size: write-cache size for map states.
    """
    self._state_handler = state_handler
    # Map state goes through a dedicated caching handler with its own read cache.
    self._map_state_handler = CachingMapStateHandler(
        state_handler, map_state_read_cache_size)
    # Probe for the Cython-accelerated coder implementation. Only an
    # ImportError means "fast coders unavailable"; the original bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        from pyflink.fn_execution import coder_impl_fast  # noqa: F401
        is_fast = True
    except ImportError:
        is_fast = False
    if not is_fast:
        self._key_coder_impl = key_coder.get_impl()
    else:
        # Fast path flattens the key row's field coders.
        from pyflink.fn_execution.coders import FlattenRowCoder
        self._key_coder_impl = FlattenRowCoder(
            key_coder._field_coders).get_impl()
    self._state_cache_size = state_cache_size
    self._map_state_write_cache_size = map_state_write_cache_size
    # All states ever created, keyed by name (never evicted).
    self._all_states = {}
    # LRU cache of internal states; evicted states are committed back.
    self._internal_state_cache = LRUCache(self._state_cache_size, None)
    self._internal_state_cache.set_on_evict(
        lambda key, value: self.commit_internal_state(value))
    self._current_key = None
    self._encoded_current_key = None
def test_flatten_row_coder(self):
    """Round-trip a 10-field row (even indices None) through FlattenRowCoder."""
    field_coder = BigIntCoder()
    field_count = 10
    coder = FlattenRowCoder(
        [field_coder for _ in range(field_count)]).get_impl()
    v = [None if i % 2 == 0 else i for i in range(field_count)]
    # decode() yields rows lazily; materialize with list() instead of a
    # manual append loop (same behavior, idiomatic form).
    result = list(coder.decode(coder.encode(v)))
    self.assertEqual([v], result)
def _create_user_defined_function_operation(factory, transform_proto, consumers,
                                            udfs_proto, beam_operation_cls,
                                            internal_operation_cls):
    """Build a Beam operation wrapping a PyFlink user-defined function.

    Creates the worker spec from the transform proto and, for stateful
    operations, a RemoteKeyedStateBackend wired to the factory's state handler.

    :param factory: operation factory providing coders, state handler, counters.
    :param transform_proto: transform descriptor (names, outputs).
    :param consumers: downstream consumers for the operation.
    :param udfs_proto: serialized user-defined function(s).
    :param beam_operation_cls: Beam-side operation class to instantiate.
    :param internal_operation_cls: internal (PyFlink) operation class.
    :return: an instance of ``beam_operation_cls``.
    """
    output_tags = list(transform_proto.outputs.keys())
    output_coders = factory.get_output_coders(transform_proto)
    spec = operation_specs.WorkerDoFn(
        serialized_fn=udfs_proto,
        output_tags=output_tags,
        input=None,
        side_inputs=None,
        output_coders=[output_coders[tag] for tag in output_tags])
    name = common.NameContext(transform_proto.unique_name)
    serialized_fn = spec.serialized_fn
    if hasattr(serialized_fn, "key_type"):
        # keyed operation, need to create the KeyedStateBackend.
        row_schema = serialized_fn.key_type.row_schema
        key_row_coder = FlattenRowCoder(
            [from_proto(f.type) for f in row_schema.fields])
        if serialized_fn.HasField('group_window'):
            # Window coder depends on the window kind declared in the proto.
            if serialized_fn.group_window.is_time_window:
                window_coder = TimeWindowCoder()
            else:
                window_coder = CountWindowCoder()
        else:
            window_coder = None
        keyed_state_backend = RemoteKeyedStateBackend(
            factory.state_handler,
            key_row_coder,
            window_coder,
            serialized_fn.state_cache_size,
            serialized_fn.map_state_read_cache_size,
            serialized_fn.map_state_write_cache_size)
        return beam_operation_cls(
            name,
            spec,
            factory.counter_factory,
            factory.state_sampler,
            consumers,
            internal_operation_cls,
            keyed_state_backend)
    elif internal_operation_cls == datastream_operations.StatefulOperation:
        # DataStream stateful path: key coder comes from type-info proto and
        # there is no window coder (namespace coder is None).
        key_row_coder = from_type_info_proto(serialized_fn.key_type_info)
        keyed_state_backend = RemoteKeyedStateBackend(
            factory.state_handler,
            key_row_coder,
            None,
            serialized_fn.state_cache_size,
            serialized_fn.map_state_read_cache_size,
            serialized_fn.map_state_write_cache_size)
        return beam_operation_cls(
            name,
            spec,
            factory.counter_factory,
            factory.state_sampler,
            consumers,
            internal_operation_cls,
            keyed_state_backend)
    else:
        # Stateless operation: no keyed state backend required.
        return beam_operation_cls(
            name,
            spec,
            factory.counter_factory,
            factory.state_sampler,
            consumers,
            internal_operation_cls)
def __init__(self, state_handler, key_coder, state_cache_size):
    """Initialize the remote keyed state backend.

    :param state_handler: handler used for all raw state requests.
    :param key_coder: coder for the current key.
    :param state_cache_size: max number of internal states kept in the LRU cache.
    """
    self._state_handler = state_handler
    # Probe for the Cython-accelerated coder implementation. Only an
    # ImportError means "fast coders unavailable"; the original bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        from pyflink.fn_execution import coder_impl_fast  # noqa: F401
        is_fast = True
    except ImportError:
        is_fast = False
    if not is_fast:
        self._key_coder_impl = key_coder.get_impl()
    else:
        # Fast path flattens the key row's field coders.
        from pyflink.fn_execution.coders import FlattenRowCoder
        self._key_coder_impl = FlattenRowCoder(
            key_coder._field_coders).get_impl()
    self._state_cache_size = state_cache_size
    # All states ever created, keyed by name (never evicted).
    self._all_states = {}
    # LRU cache of internal states; evicted states are committed back.
    self._all_internal_states = LRUCache(self._state_cache_size, None)
    self._all_internal_states.set_on_evict(lambda k, v: v.commit())
    self._current_key = None
    self._encoded_current_key = None
def test_flatten_row_coder(self):
    """Round-trip a 10-field BIGINT row where even positions hold None."""
    num_fields = 10
    shared_coder = BigIntCoder()
    coder = FlattenRowCoder([shared_coder] * num_fields)
    row = []
    for idx in range(num_fields):
        row.append(idx if idx % 2 else None)
    self.check_coder(coder, row)