def decode_from_stream(self, in_stream, nested):
  """Decode a value and wrap it with this coder's stored window metadata.

  Only the value is read from ``in_stream``; the timestamp, windows and pane
  info passed to ``windowed_value.create`` come from attributes held on this
  coder instance.

  Args:
    in_stream: input stream handed to the value coder.
    nested: forwarded unchanged to the value coder.

  Returns:
    The result of ``windowed_value.create`` for the decoded value.
  """
  decoded_value = self._value_coder.decode_from_stream(in_stream, nested)
  return windowed_value.create(
      decoded_value, self._timestamp, self._windows, self._pane_info)
def decode_from_stream(self, in_stream, nested):
  """Decode a windowed value from ``in_stream``.

  The stream is consumed in this order: a big-endian uint64 timestamp, the
  windows, the pane info, and finally the value.

  Args:
    in_stream: input stream to read from.
    nested: forwarded to the value coder only; the windows and pane info
      coders are always invoked with ``True``.

  Returns:
    The result of ``windowed_value.create`` with the timestamp expressed in
    microseconds.
  """
  # NOTE(review): presumably _to_normal_time maps the unsigned wire value back
  # to a signed millisecond count -- confirm against its definition.
  millis = self._to_normal_time(in_stream.read_bigendian_uint64())

  # Restore MIN/MAX timestamps to their actual values as encoding incurs loss
  # of precision while converting to millis.
  # Note: This is only a best effort here as there is no way to know if these
  # were indeed MIN/MAX timestamps.
  # TODO(BEAM-1524): Clean this up once we have a BEAM wide consensus on
  # precision of timestamps.
  lowest_millis = -(abs(MIN_TIMESTAMP_micros) // 1000)
  if millis <= lowest_millis:
    micros = MIN_TIMESTAMP_micros
  elif millis >= MAX_TIMESTAMP_micros // 1000:
    micros = MAX_TIMESTAMP_micros
  else:
    micros = millis * 1000

  decoded_windows = self._windows_coder.decode_from_stream(in_stream, True)
  # Read PaneInfo encoded byte.
  decoded_pane_info = self._pane_info_coder.decode_from_stream(
      in_stream, True)
  decoded_value = self._value_coder.decode_from_stream(in_stream, nested)

  # The timestamp is passed as a plain integer to avoid creation of a
  # Timestamp object.
  return windowed_value.create(
      decoded_value, micros, decoded_windows, decoded_pane_info)
def decode_from_stream(self, in_stream, nested):
  """Decode a windowed value from ``in_stream``.

  The stream is consumed in this order: a big-endian uint64 timestamp, the
  windows, the pane info, and finally the value.

  Args:
    in_stream: input stream to read from.
    nested: forwarded to the value coder only; the windows and pane info
      coders are always invoked with ``True``.

  Returns:
    The result of ``windowed_value.create`` with the timestamp expressed in
    microseconds.
  """
  # NOTE(review): presumably _to_normal_time maps the unsigned wire value back
  # to a signed millisecond count -- confirm against its definition.
  millis = self._to_normal_time(in_stream.read_bigendian_uint64())

  # Restore MIN/MAX timestamps to their actual values as encoding incurs loss
  # of precision while converting to millis.
  # Note: This is only a best effort here as there is no way to know if these
  # were indeed MIN/MAX timestamps.
  # TODO(BEAM-1524): Clean this up once we have a BEAM wide consensus on
  # precision of timestamps.
  lowest_millis = -(abs(MIN_TIMESTAMP_micros) // 1000)
  if millis <= lowest_millis:
    micros = MIN_TIMESTAMP_micros
  elif millis >= MAX_TIMESTAMP_micros // 1000:
    micros = MAX_TIMESTAMP_micros
  else:
    micros = millis * 1000

  decoded_windows = self._windows_coder.decode_from_stream(in_stream, True)
  # Read PaneInfo encoded byte.
  decoded_pane_info = self._pane_info_coder.decode_from_stream(
      in_stream, True)
  decoded_value = self._value_coder.decode_from_stream(in_stream, nested)

  # The timestamp is passed as a plain integer to avoid creation of a
  # Timestamp object.
  return windowed_value.create(
      decoded_value, micros, decoded_windows, decoded_pane_info)
def __init__(
    self,
    name,
    spec,
    counter_factory,
    sampler,
    consumers,
    operation_cls,
    keyed_state_backend):
  """Store the keyed state backend, then delegate to the parent initializer.

  Args:
    name, spec, counter_factory, sampler, consumers, operation_cls: passed
      through unchanged, in this order, to the parent ``__init__``.
    keyed_state_backend: kept on the instance as ``_keyed_state_backend``.
  """
  self._keyed_state_backend = keyed_state_backend

  # Placeholder windowed value (None value, timestamp -1, no windows, no pane
  # info) created once up front.
  # NOTE(review): presumably mutated and reused per element to avoid
  # re-allocation -- confirm against the code that reads it.
  placeholder = windowed_value.create(None, -1, None, None)
  self._reusable_windowed_value = placeholder

  # Both attributes are set before the parent initializer runs, matching the
  # original ordering.
  super(StatefulFunctionOperation, self).__init__(
      name, spec, counter_factory, sampler, consumers, operation_cls)
def test_param_windowed_value_coder(self):
  # Exercises ParamWindowedValueCoder built from a payload that is the
  # WindowedValueCoder encoding of a template windowed value: checks the
  # binary form, then round-trips unnested and tuple-nested values.
  from apache_beam.transforms.window import IntervalWindow
  from apache_beam.utils.windowed_value import PaneInfo

  # Template whose encoding becomes the coder payload.
  template = windowed_value.create(
      b'',
      # Milliseconds to microseconds
      1000 * 1000,
      (IntervalWindow(11, 21), ),
      PaneInfo(True, False, 1, 2, 3))
  payload = coders.WindowedValueCoder(
      coders.BytesCoder(), coders.IntervalWindowCoder()).encode(template)

  def param_coder(value_coder):
    # Fresh coder sharing the payload but with its own value coder.
    return coders.ParamWindowedValueCoder(
        payload, [value_coder, coders.IntervalWindowCoder()])

  def expected(value):
    # Windowed value carrying the template's timestamp, window and pane.
    return windowed_value.WindowedValue(
        value,
        1, (window.IntervalWindow(11, 21), ),
        PaneInfo(True, False, 1, 2, 3))

  # Test binary representation
  varint_coder = param_coder(coders.VarIntCoder())
  self.assertEqual(
      b'\x01', varint_coder.encode(window.GlobalWindows.windowed_value(1)))

  # Test unnested
  self.check_coder(
      param_coder(coders.VarIntCoder()), expected(3), expected(1))

  # Test nested
  nested_coder = coders.TupleCoder((
      param_coder(coders.FloatCoder()),
      param_coder(coders.StrUtf8Coder())))
  self.check_coder(nested_coder, (expected(1.5), expected("abc")))
def test_windowed_value_coder(self):
  # Covers WindowedValueCoder: cloud-object form, exact binary encoding,
  # decoding of a very large timestamp, and round-trips (unnested, global
  # window, nested inside a TupleCoder).
  global_coder = coders.WindowedValueCoder(
      coders.VarIntCoder(), coders.GlobalWindowCoder())

  # Verify cloud object representation
  expected_cloud_object = {
      '@type': 'kind:windowed_value',
      'is_wrapper': True,
      'component_encodings': [
          coders.VarIntCoder().as_cloud_object(),
          coders.GlobalWindowCoder().as_cloud_object(),
      ],
  }
  self.assertEqual(expected_cloud_object, global_coder.as_cloud_object())

  # Test binary representation
  encoded_one = global_coder.encode(window.GlobalWindows.windowed_value(1))
  self.assertEqual(
      b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01', encoded_one)

  # Test decoding large timestamp
  decoded = global_coder.decode(
      b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00')
  expected_min = windowed_value.create(
      0, MIN_TIMESTAMP.micros, (GlobalWindow(), ))
  self.assertEqual(decoded, expected_min)

  # Test unnested
  self.check_coder(
      coders.WindowedValueCoder(coders.VarIntCoder()),
      windowed_value.WindowedValue(3, -100, ()),
      windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

  # Test Global Window
  self.check_coder(
      coders.WindowedValueCoder(
          coders.VarIntCoder(), coders.GlobalWindowCoder()),
      window.GlobalWindows.windowed_value(1))

  # Test nested
  tuple_coder = coders.TupleCoder((
      coders.WindowedValueCoder(coders.FloatCoder()),
      coders.WindowedValueCoder(coders.StrUtf8Coder())))
  nested_values = (
      windowed_value.WindowedValue(1.5, 0, ()),
      windowed_value.WindowedValue("abc", 10, ('window', )))
  self.check_coder(tuple_coder, nested_values)
def test_windowed_value_coder(self):
  # Covers WindowedValueCoder: cloud-object form, exact binary encoding,
  # decoding of a very large timestamp, and round-trips (unnested, global
  # window, nested inside a TupleCoder).
  coder = coders.WindowedValueCoder(
      coders.VarIntCoder(), coders.GlobalWindowCoder())

  # Verify cloud object representation
  self.assertEqual(
      {
          '@type': 'kind:windowed_value',
          'is_wrapper': True,
          'component_encodings': [
              coders.VarIntCoder().as_cloud_object(),
              coders.GlobalWindowCoder().as_cloud_object(),
          ],
      },
      coder.as_cloud_object())

  # Test binary representation.
  # Encoded payloads are binary data, so they are written as bytes literals:
  # on Python 3 a text literal never compares equal to the bytes produced by
  # encode(); on Python 2 b'...' is the same type as '...'.
  self.assertEqual(
      b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01',
      coder.encode(window.GlobalWindows.windowed_value(1)))

  # Test decoding large timestamp (input must likewise be bytes).
  self.assertEqual(
      coder.decode(b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00'),
      windowed_value.create(0, MIN_TIMESTAMP.micros, (GlobalWindow(), )))

  # Test unnested
  self.check_coder(
      coders.WindowedValueCoder(coders.VarIntCoder()),
      windowed_value.WindowedValue(3, -100, ()),
      windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

  # Test Global Window
  self.check_coder(
      coders.WindowedValueCoder(
          coders.VarIntCoder(), coders.GlobalWindowCoder()),
      window.GlobalWindows.windowed_value(1))

  # Test nested
  self.check_coder(
      coders.TupleCoder((
          coders.WindowedValueCoder(coders.FloatCoder()),
          coders.WindowedValueCoder(coders.StrUtf8Coder()))),
      (
          windowed_value.WindowedValue(1.5, 0, ()),
          windowed_value.WindowedValue("abc", 10, ('window', ))))
class StandardCodersTest(unittest.TestCase):
  """Checks coders against the examples loaded from STANDARD_CODERS_YAML.

  Each test case supplies a coder spec plus a mapping from expected encoded
  strings to JSON values. The values are converted to Python objects with
  ``_urn_to_json_value_parser`` and then encoded/decoded with the coder built
  from the spec.
  """

  # Maps a coder URN to a callable turning the JSON example value into the
  # Python value that coder handles. Parsers for composite coders also
  # receive the parsers of their component coders as extra arguments.
  _urn_to_json_value_parser = {
      'beam:coder:bytes:v1': lambda x: x.encode('utf-8'),
      'beam:coder:string_utf8:v1': lambda x: x,
      'beam:coder:varint:v1': lambda x: x,
      'beam:coder:kv:v1': lambda x, key_parser, value_parser:
      (key_parser(x['key']), value_parser(x['value'])),
      # The JSON values are multiplied by 1000 to obtain micros.
      'beam:coder:interval_window:v1': lambda x: IntervalWindow(
          start=Timestamp(micros=(x['end'] - x['span']) * 1000),
          end=Timestamp(micros=x['end'] * 1000)),
      'beam:coder:iterable:v1': lambda x, parser: list(map(parser, x)),
      'beam:coder:global_window:v1': lambda x: window.GlobalWindow(),
      'beam:coder:windowed_value:v1': lambda x, value_parser, window_parser:
      windowed_value.create(
          value_parser(x['value']),
          x['timestamp'] * 1000,
          tuple([window_parser(w) for w in x['windows']])),
      'beam:coder:timer:v1': lambda x, payload_parser: dict(
          payload=payload_parser(x['payload']),
          timestamp=Timestamp(micros=x['timestamp'] * 1000)),
      'beam:coder:double:v1': parse_float,
  }

  def test_standard_coders(self):
    # Runs every test case found in the standard coders YAML file.
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    """Verify all examples of one test case.

    Args:
      name: test case name (not used inside this method).
      spec: test case dict; reads 'coder', 'examples', and optionally
        'nested' and 'index'.
    """
    def assert_equal(actual, expected):
      """Handle nan values which self.assertEqual fails on."""
      if (isinstance(actual, float) and isinstance(expected, float) and
          math.isnan(actual) and math.isnan(expected)):
        return
      self.assertEqual(actual, expected)

    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    # Without an explicit 'nested' setting both contexts are exercised.
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        # Example keys are text; latin1 turns them into the expected bytes.
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            # In fix mode mismatches are recorded for tearDownClass instead
            # of failing the test.
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            decoded = decode_nested(coder, expected_encoded, nested)
            assert_equal(decoded, value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(
              decode_nested(coder, expected_encoded, nested), value)

  def parse_coder(self, spec):
    """Build a coder from a spec dict by registering it as a Coder proto.

    Component coders listed under 'components' are built recursively and
    referenced by the ids the pipeline context assigns to them.
    """
    context = pipeline_context.PipelineContext()
    coder_id = str(hash(str(spec)))
    component_ids = [
        context.coders.get_id(self.parse_coder(c))
        for c in spec.get('components', ())
    ]
    context.coders.put_proto(
        coder_id,
        beam_runner_api_pb2.Coder(
            spec=beam_runner_api_pb2.FunctionSpec(
                urn=spec['urn'], payload=spec.get('payload')),
            component_coder_ids=component_ids))
    return context.coders.get_by_id(coder_id)

  def json_value_parser(self, coder_spec):
    """Return a callable converting a JSON example value for ``coder_spec``.

    Parsers for the component coders are built recursively and passed as
    additional positional arguments to the parser registered for the URN.
    """
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())
    ]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.

  fix = False
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    """In fix mode, rewrite the recorded expected encodings in the YAML file."""
    if cls.fix and cls.to_fix:
      print("FIXING", len(cls.to_fix), "TESTS")
      doc_sep = '\n---\n'
      # Context managers close the file promptly, so the later write is not
      # racing a still-open read handle and is flushed deterministically.
      with open(STANDARD_CODERS_YAML) as yaml_file:
        docs = yaml_file.read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print(quote(expected_encoded), "->", quote(actual_encoded))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
        yaml_file.write(doc_sep.join(docs))
class StandardCodersTest(unittest.TestCase):
  """Checks coders against the examples loaded from STANDARD_CODERS_YAML.

  Each test case supplies a coder spec plus a mapping from expected encoded
  strings to JSON values. The values are converted to Python objects with
  ``_urn_to_json_value_parser`` (or a schema-derived parser for row coders)
  and then encoded/decoded with the coder built from the spec.
  """

  # Maps a coder URN to a callable turning the JSON example value into the
  # Python value that coder handles. Parsers for composite coders also
  # receive the parsers of their component coders as extra arguments.
  _urn_to_json_value_parser = {
      'beam:coder:bytes:v1': lambda x: x.encode('utf-8'),
      'beam:coder:bool:v1': lambda x: x,
      'beam:coder:string_utf8:v1': lambda x: x,
      'beam:coder:varint:v1': lambda x: x,
      'beam:coder:kv:v1': lambda x, key_parser, value_parser:
      (key_parser(x['key']), value_parser(x['value'])),
      # The JSON values are multiplied by 1000 to obtain micros.
      'beam:coder:interval_window:v1': lambda x: IntervalWindow(
          start=Timestamp(micros=(x['end'] - x['span']) * 1000),
          end=Timestamp(micros=x['end'] * 1000)),
      'beam:coder:iterable:v1': lambda x, parser: list(map(parser, x)),
      'beam:coder:global_window:v1': lambda x: window.GlobalWindow(),
      'beam:coder:windowed_value:v1': lambda x, value_parser, window_parser:
      windowed_value.create(
          value_parser(x['value']),
          x['timestamp'] * 1000,
          tuple([window_parser(w) for w in x['windows']])),
      'beam:coder:param_windowed_value:v1': lambda x, value_parser,
      window_parser: windowed_value.create(
          value_parser(x['value']),
          x['timestamp'] * 1000,
          tuple([window_parser(w) for w in x['windows']]),
          PaneInfo(
              x['pane']['is_first'],
              x['pane']['is_last'],
              PaneInfoTiming.from_string(x['pane']['timing']),
              x['pane']['index'],
              x['pane']['on_time_index'])),
      # A timer with clearBit set carries no timestamps or pane info.
      'beam:coder:timer:v1': lambda x, value_parser, window_parser: (
          userstate.Timer(
              user_key=value_parser(x['userKey']),
              dynamic_timer_tag=x['dynamicTimerTag'],
              clear_bit=x['clearBit'],
              windows=tuple([window_parser(w) for w in x['windows']]),
              fire_timestamp=None,
              hold_timestamp=None,
              paneinfo=None) if x['clearBit'] else userstate.Timer(
                  user_key=value_parser(x['userKey']),
                  dynamic_timer_tag=x['dynamicTimerTag'],
                  clear_bit=x['clearBit'],
                  fire_timestamp=Timestamp(micros=x['fireTimestamp'] * 1000),
                  hold_timestamp=Timestamp(micros=x['holdTimestamp'] * 1000),
                  windows=tuple([window_parser(w) for w in x['windows']]),
                  paneinfo=PaneInfo(
                      x['pane']['is_first'],
                      x['pane']['is_last'],
                      PaneInfoTiming.from_string(x['pane']['timing']),
                      x['pane']['index'],
                      x['pane']['on_time_index']))),
      'beam:coder:double:v1': parse_float,
      'beam:coder:sharded_key:v1': lambda x, value_parser: ShardedKey(
          key=value_parser(x['key']), shard_id=x['shardId'].encode('utf-8')),
      'beam:coder:custom_window:v1': lambda x, window_parser: window_parser(
          x['window'])
  }

  def test_standard_coders(self):
    # Runs every test case found in the standard coders YAML file.
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    """Verify all examples of one test case.

    Args:
      name: test case name (not used inside this method).
      spec: test case dict; reads 'coder', 'examples', and optionally
        'nested' and 'index'.
    """
    def assert_equal(actual, expected):
      """Handle nan values which self.assertEqual fails on."""
      if (isinstance(actual, float) and isinstance(expected, float) and
          math.isnan(actual) and math.isnan(expected)):
        return
      self.assertEqual(actual, expected)

    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    # Without an explicit 'nested' setting both contexts are exercised.
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        # Example keys are text; latin1 turns them into the expected bytes.
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            # In fix mode mismatches are recorded for tearDownClass instead
            # of failing the test.
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            decoded = decode_nested(coder, expected_encoded, nested)
            assert_equal(decoded, value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(
              decode_nested(coder, expected_encoded, nested), value)

  def parse_coder(self, spec):
    """Build a coder from a spec dict by registering it as a Coder proto.

    Component coders listed under 'components' are built recursively and
    referenced by the ids the pipeline context assigns to them. The optional
    'payload' string is converted to bytes with latin1.
    """
    context = pipeline_context.PipelineContext()
    coder_id = str(hash(str(spec)))
    component_ids = [
        context.coders.get_id(self.parse_coder(c))
        for c in spec.get('components', ())
    ]
    context.coders.put_proto(
        coder_id,
        beam_runner_api_pb2.Coder(
            spec=beam_runner_api_pb2.FunctionSpec(
                urn=spec['urn'],
                payload=spec.get('payload', '').encode('latin1')),
            component_coder_ids=component_ids))
    return context.coders.get_by_id(coder_id)

  def json_value_parser(self, coder_spec):
    """Return a callable converting a JSON example value for ``coder_spec``.

    Row coders get a parser derived from the schema in their payload; all
    other coders use the parser registered for their URN, which receives the
    recursively built component parsers as extra positional arguments.
    """
    # TODO: integrate this with the logic for the other parsers
    if coder_spec['urn'] == 'beam:coder:row:v1':
      schema = schema_pb2.Schema.FromString(
          coder_spec['payload'].encode('latin1'))
      return value_parser_from_schema(schema)

    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())
    ]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.

  fix = False
  to_fix = {}  # type: Dict[Tuple[int, bytes], bytes]

  @classmethod
  def tearDownClass(cls):
    """In fix mode, rewrite the recorded expected encodings in the YAML file."""
    if cls.fix and cls.to_fix:
      print("FIXING", len(cls.to_fix), "TESTS")
      doc_sep = '\n---\n'
      # Context managers close the file promptly, so the later write is not
      # racing a still-open read handle and is flushed deterministically.
      with open(STANDARD_CODERS_YAML) as yaml_file:
        docs = yaml_file.read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print(quote(expected_encoded), "->", quote(actual_encoded))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
        yaml_file.write(doc_sep.join(docs))
class StandardCodersTest(unittest.TestCase):
  """Checks coders against the examples loaded from STANDARD_CODERS_YAML.

  Each test case supplies a coder spec plus a mapping from expected encoded
  strings to JSON values. Coders are built from ``_urn_to_coder_class`` and
  example values are converted with ``_urn_to_json_value_parser``.
  """

  # Maps a coder URN to a callable that builds the coder; composite entries
  # receive their already-built component coders as positional arguments.
  _urn_to_coder_class = {
      'urn:beam:coders:bytes:0.1': coders.BytesCoder,
      'urn:beam:coders:varint:0.1': coders.VarIntCoder,
      'urn:beam:coders:kv:0.1': lambda k, v: coders.TupleCoder((k, v)),
      'urn:beam:coders:interval_window:0.1': coders.IntervalWindowCoder,
      'urn:beam:coders:stream:0.1': lambda t: coders.IterableCoder(t),
      'urn:beam:coders:global_window:0.1': coders.GlobalWindowCoder,
      'urn:beam:coders:windowed_value:0.1': lambda v, w:
      coders.WindowedValueCoder(v, w)
  }

  # Maps a coder URN to a callable turning the JSON example value into the
  # Python value that coder handles. Parsers for composite coders also
  # receive the parsers of their component coders as extra arguments.
  _urn_to_json_value_parser = {
      'urn:beam:coders:bytes:0.1': lambda x: x,
      'urn:beam:coders:varint:0.1': lambda x: x,
      'urn:beam:coders:kv:0.1': lambda x, key_parser, value_parser:
      (key_parser(x['key']), value_parser(x['value'])),
      # The JSON values are multiplied by 1000 to obtain micros.
      'urn:beam:coders:interval_window:0.1': lambda x: IntervalWindow(
          start=Timestamp(micros=(x['end'] - x['span']) * 1000),
          end=Timestamp(micros=x['end'] * 1000)),
      # Materialized as a list: on Python 3 a bare map() is a one-shot
      # iterator that is exhausted by encoding and never compares equal to
      # the decoded value. On Python 2 map() already returns a list.
      'urn:beam:coders:stream:0.1': lambda x, parser: list(map(parser, x)),
      'urn:beam:coders:global_window:0.1': lambda x: window.GlobalWindow(),
      'urn:beam:coders:windowed_value:0.1': lambda x, value_parser,
      window_parser: windowed_value.create(
          value_parser(x['value']),
          x['timestamp'] * 1000,
          tuple([window_parser(w) for w in x['windows']]))
  }

  def test_standard_coders(self):
    # Runs every test case found in the standard coders YAML file.
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    """Verify all examples of one test case.

    Args:
      name: test case name (not used inside this method).
      spec: test case dict; reads 'coder', 'examples', and optionally
        'nested' and 'index'.
    """
    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    # Without an explicit 'nested' setting both contexts are exercised.
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        # Example keys are text; latin1 turns them into the expected bytes.
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            # In fix mode mismatches are recorded for tearDownClass instead
            # of failing the test.
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            self.assertEqual(
                decode_nested(coder, expected_encoded, nested), value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(
              decode_nested(coder, expected_encoded, nested), value)

  def parse_coder(self, spec):
    """Build the coder for ``spec``, recursing into its 'components'."""
    return self._urn_to_coder_class[spec['urn']](
        *[self.parse_coder(c) for c in spec.get('components', ())])

  def json_value_parser(self, coder_spec):
    """Return a callable converting a JSON example value for ``coder_spec``.

    Parsers for the component coders are built recursively and passed as
    additional positional arguments to the parser registered for the URN.
    """
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())
    ]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.

  fix = False
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    """In fix mode, rewrite the recorded expected encodings in the YAML file."""
    if cls.fix and cls.to_fix:
      # Single pre-formatted string: prints the same text whether print is a
      # statement (Python 2) or a function (Python 3).
      print("FIXING %d TESTS" % len(cls.to_fix))
      doc_sep = '\n---\n'
      # Context managers close the file promptly, so the later write is not
      # racing a still-open read handle and is flushed deterministically.
      with open(STANDARD_CODERS_YAML) as yaml_file:
        docs = yaml_file.read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print("%s -> %s" % (quote(expected_encoded), quote(actual_encoded)))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
        yaml_file.write(doc_sep.join(docs))
def decode_from_stream(self, in_stream, nested):
  """Decode a windowed value laid out as value, timestamp, windows.

  Args:
    in_stream: input stream to read from.
    nested: not consulted here; the value and windows coders are both
      invoked with ``True``.

  Returns:
    The result of ``windowed_value.create`` for the decoded parts.
  """
  # The three reads consume the stream sequentially, so their order is fixed.
  decoded_value = self._value_coder.decode_from_stream(in_stream, True)
  # Kept as the raw big-endian int64 to avoid creation of Timestamp object.
  raw_timestamp = in_stream.read_bigendian_int64()
  decoded_windows = self._windows_coder.decode_from_stream(in_stream, True)
  return windowed_value.create(decoded_value, raw_timestamp, decoded_windows)