def test_list_coder(self):
    """ListCoder of VarInts round-trips bare and inside a tuple coder."""
    element_coder = coders.VarIntCoder()
    list_coder = coders.ListCoder(element_coder)
    # Unnested round-trips: single element and a mixed-sign list.
    self.check_coder(list_coder, [1], [-1, 0, 100])
    # Nested: the list coder as the second component of a tuple coder.
    nested_coder = coders.TupleCoder((coders.VarIntCoder(), list_coder))
    self.check_coder(nested_coder, (1, [1, 2, 3]))
def test_tuple_coder(self):
    """TupleCoder: cloud-object form, binary form, and round-trips.

    Fix: the binary-representation and unnested checks previously used str
    literals ('\\x04abc', 'abc\\0', ...) with BytesCoder. Under Python 3,
    BytesCoder encodes/produces bytes, so those assertions could never pass;
    the values are now proper b'' literals.
    """
    kv_coder = coders.TupleCoder(
        (coders.VarIntCoder(), coders.BytesCoder()))
    # Verify cloud object representation.
    self.assertEqual({
        '@type': 'kind:pair',
        'is_pair_like': True,
        'component_encodings': [
            coders.VarIntCoder().as_cloud_object(),
            coders.BytesCoder().as_cloud_object()
        ],
    }, kv_coder.as_cloud_object())
    # Test binary representation: varint 4, then the trailing (unnested)
    # bytes component without a length prefix.
    self.assertEqual(b'\x04abc', kv_coder.encode((4, b'abc')))
    # Test unnested round-trips, including embedded NUL bytes.
    self.check_coder(
        kv_coder, (1, b'a'), (-2, b'a' * 100), (300, b'abc\0' * 5))
    # Test nested: a tuple coder inside a tuple coder; the StrUtf8Coder
    # components legitimately take text values.
    self.check_coder(
        coders.TupleCoder((
            coders.TupleCoder((coders.PickleCoder(), coders.VarIntCoder())),
            coders.StrUtf8Coder())),
        ((1, 2), 'a'),
        ((-2, 5), u'a\u0101' * 100),
        ((300, 1), 'abc\0' * 5))
def test_varint_coder(self):
    """VarIntCoder round-trips small, multi-byte, and large values."""
    varint_coder = coders.VarIntCoder()
    # Small ints.
    self.check_coder(varint_coder, *range(-10, 10))
    # Multi-byte encoding starts at 128.
    self.check_coder(varint_coder, *range(120, 140))
    # Large values: exponentially spaced magnitudes with alternating sign,
    # up to the signed 64-bit limit.
    MAX_64_BIT_INT = 0x7fffffffffffffff
    large_values = [
        int(math.pow(-1, k) * math.exp(k))
        for k in range(0, int(math.log(MAX_64_BIT_INT)))
    ]
    self.check_coder(varint_coder, *large_values)
def test_param_windowed_value_coder(self):
    """ParamWindowedValueCoder: window/timestamp/pane come from a payload.

    A ParamWindowedValueCoder encodes only the value; the windowing metadata
    is fixed by the payload supplied at construction time.
    """
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo
    # Build the windowed value whose metadata becomes the coder's fixed
    # parameters.
    wv = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21),),
        PaneInfo(True, False, 1, 2, 3))
    windowed_value_coder = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder())
    payload = windowed_value_coder.encode(wv)
    coder = coders.ParamWindowedValueCoder(
        payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])
    # Test binary representation: only the varint value 1 is encoded.
    self.assertEqual(
        b'\x01', coder.encode(window.GlobalWindows.windowed_value(1)))
    # Test unnested
    self.check_coder(
        coders.ParamWindowedValueCoder(
            payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
        windowed_value.WindowedValue(
            3, 1, (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
        windowed_value.WindowedValue(
            1, 1, (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)))
    # Test nested: two param coders with different value coders inside a
    # tuple coder.
    self.check_coder(
        coders.TupleCoder((
            coders.ParamWindowedValueCoder(
                payload,
                [coders.FloatCoder(), coders.IntervalWindowCoder()]),
            coders.ParamWindowedValueCoder(
                payload,
                [coders.StrUtf8Coder(), coders.IntervalWindowCoder()]))),
        (windowed_value.WindowedValue(
            1.5, 1, (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
         windowed_value.WindowedValue(
             "abc", 1, (window.IntervalWindow(11, 21),),
             PaneInfo(True, False, 1, 2, 3))))
def test_sharded_key_coder(self):
    """ShardedKeyCoder: cloud-object form, binary form, and round-trips."""
    # Triples of (key, expected encoded-key bytes, key coder).
    key_and_coders = [(b'', b'\x00', coders.BytesCoder()),
                      (b'key', b'\x03key', coders.BytesCoder()),
                      ('key', b'\03\x6b\x65\x79', coders.StrUtf8Coder()),
                      (('k', 1),
                       b'\x01\x6b\x01',
                       coders.TupleCoder(
                           (coders.StrUtf8Coder(), coders.VarIntCoder())))]
    for key, bytes_repr, key_coder in key_and_coders:
        coder = coders.ShardedKeyCoder(key_coder)
        # Verify cloud object representation
        self.assertEqual({
            '@type': 'kind:sharded_key',
            'component_encodings': [key_coder.as_cloud_object()]
        }, coder.as_cloud_object())
        # Encoding is the length-prefixed shard id followed by the key bytes.
        self.assertEqual(
            b'\x00' + bytes_repr, coder.encode(ShardedKey(key, b'')))
        self.assertEqual(
            b'\x03123' + bytes_repr, coder.encode(ShardedKey(key, b'123')))
        # Test unnested
        self.check_coder(coder, ShardedKey(key, b''))
        self.check_coder(coder, ShardedKey(key, b'123'))
        # Pair this coder with every other key coder inside a tuple coder.
        for other_key, _, other_key_coder in key_and_coders:
            other_coder = coders.ShardedKeyCoder(other_key_coder)
            # Test nested
            self.check_coder(
                coders.TupleCoder((coder, other_coder)),
                (ShardedKey(key, b''), ShardedKey(other_key, b'')))
            self.check_coder(
                coders.TupleCoder((coder, other_coder)),
                (ShardedKey(key, b'123'), ShardedKey(other_key, b'')))
def int64_user_gauge(namespace, name, metric, ptransform=None, tag=None):
    # type: (...) -> metrics_pb2.MonitoringInfo
    """Return the gauge monitoring info for the URN, metric and labels.

    Args:
      namespace: User-defined namespace of counter.
      name: Name of counter.
      metric: The GaugeData containing the metrics.
      ptransform: The ptransform/step name used as a label.
      tag: The output tag name, used as a label.

    Raises:
      TypeError: If metric is not a GaugeData instance.
    """
    labels = create_labels(
        ptransform=ptransform, tag=tag, namespace=namespace, name=name)
    if isinstance(metric, GaugeData):
        # Gauge values are varint-encoded together with their timestamp.
        coder = coders.VarIntCoder()
        value = metric.value
        timestamp = metric.timestamp
    else:
        raise TypeError(
            'Expected GaugeData metric type but received %s with value %s' %
            (type(metric), metric))
    payload = _encode_gauge(coder, timestamp, value)
    return create_monitoring_info(
        USER_GAUGE_URN, LATEST_INT64_TYPE, payload, labels)
def test_state_backed_iterable_coder(self):
    """StateBackedIterableCoder spills elements past the threshold to state."""
    # pylint: disable=global-variable-undefined
    # required for pickling by reference
    global state
    state = {}

    def iterable_state_write(values, element_coder_impl):
        # Store each element's encoded bytes under a fresh token.
        token = b'state_token_%d' % len(state)
        state[token] = [element_coder_impl.encode(e) for e in values]
        return token

    def iterable_state_read(token, element_coder_impl):
        return [element_coder_impl.decode(s) for s in state[token]]

    coder = coders.StateBackedIterableCoder(
        coders.VarIntCoder(),
        read_state=iterable_state_read,
        write_state=iterable_state_write,
        # A threshold of 1 forces even tiny iterables into backing state.
        write_state_threshold=1)
    # Note: do not use check_coder
    # see https://github.com/cloudpipe/cloudpickle/issues/452
    self._observe(coder)
    self.assertEqual([1, 2, 3], coder.decode(coder.encode([1, 2, 3])))
    # Ensure that state was actually used.
    self.assertNotEqual(state, {})
    # Same round-trip, nested inside a tuple coder.
    tupleCoder = coders.TupleCoder((coder, coder))
    self._observe(tupleCoder)
    self.assertEqual(([1], [2, 3]),
                     tupleCoder.decode(tupleCoder.encode(([1], [2, 3]))))
def test_iterable_coder(self):
    """IterableCoder: cloud-object form plus unnested and nested round-trips."""
    iterable_coder = coders.IterableCoder(coders.VarIntCoder())
    # Cloud object representation.
    expected_cloud_object = {
        '@type': 'kind:stream',
        'is_stream_like': True,
        'component_encodings': [coders.VarIntCoder().as_cloud_object()]
    }
    self.assertEqual(expected_cloud_object, iterable_coder.as_cloud_object())
    # Unnested round-trips.
    self.check_coder(iterable_coder, [1], [-1, 0, 100])
    # Nested: iterable coder as the second component of a tuple coder.
    nested_coder = coders.TupleCoder(
        (coders.VarIntCoder(), coders.IterableCoder(coders.VarIntCoder())))
    self.check_coder(nested_coder, (1, [1, 2, 3]))
def _decode_gauge(coder, payload):
    """Returns a tuple of (timestamp, value)."""
    stream = coder_impl.create_InputStream(payload)
    # The payload starts with a varint millisecond timestamp, followed by
    # the value in the supplied coder's encoding.
    time_ms = coders.VarIntCoder().get_impl().decode_from_stream(stream, True)
    value = coder.get_impl().decode_from_stream(stream, True)
    return (time_ms / 1000.0, value)
def extract_gauge_value(monitoring_info_proto):
    """Return (timestamp, value) extracted from a gauge monitoring info."""
    if is_gauge(monitoring_info_proto):
        # Only LATEST_INT64_TYPE is currently supported.
        return _decode_gauge(
            coders.VarIntCoder(), monitoring_info_proto.payload)
    raise ValueError('Unsupported type %s' % monitoring_info_proto.type)
def test_windowedvalue_coder_paneinfo(self):
    """Pane info survives WindowedValueCoder round-trips, alone and paired."""
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())
    Timing = windowed_value.PaneInfoTiming
    test_paneinfo_values = [
        windowed_value.PANE_INFO_UNKNOWN,
        windowed_value.PaneInfo(True, True, Timing.EARLY, 0, -1),
        windowed_value.PaneInfo(True, False, Timing.ON_TIME, 0, 0),
        windowed_value.PaneInfo(True, False, Timing.ON_TIME, 10, 0),
        windowed_value.PaneInfo(False, True, Timing.ON_TIME, 0, 23),
        windowed_value.PaneInfo(False, True, Timing.ON_TIME, 12, 23),
        windowed_value.PaneInfo(False, False, Timing.LATE, 0, 123),
    ]
    test_values = [
        windowed_value.WindowedValue(123, 234, (GlobalWindow(),), p)
        for p in test_paneinfo_values
    ]
    # Test unnested: the UNKNOWN pane first, then each constructed pane.
    self.check_coder(
        coder,
        windowed_value.WindowedValue(
            123, 234, (GlobalWindow(),), windowed_value.PANE_INFO_UNKNOWN))
    for value in test_values:
        self.check_coder(coder, value)
    # Test nested: every ordered pair of pane values in a tuple coder.
    for value1 in test_values:
        for value2 in test_values:
            self.check_coder(
                coders.TupleCoder((coder, coder)), (value1, value2))
def test_state_backed_iterable_coder(self):
    """StateBackedIterableCoder round-trips via check_coder with a context."""
    # pylint: disable=global-variable-undefined
    # required for pickling by reference
    global state
    state = {}

    def iterable_state_write(values, element_coder_impl):
        # Store each element's encoded bytes under a fresh token.
        token = b'state_token_%d' % len(state)
        state[token] = [element_coder_impl.encode(e) for e in values]
        return token

    def iterable_state_read(token, element_coder_impl):
        return [element_coder_impl.decode(s) for s in state[token]]

    coder = coders.StateBackedIterableCoder(
        coders.VarIntCoder(),
        read_state=iterable_state_read,
        write_state=iterable_state_write,
        # A threshold of 1 forces even tiny iterables into backing state.
        write_state_threshold=1)
    # The pipeline context supplies the state callbacks used during
    # (de)serialization of the coder itself.
    context = pipeline_context.PipelineContext(
        iterable_state_read=iterable_state_read,
        iterable_state_write=iterable_state_write)
    # Size estimation is not exercised for this coder.
    self.check_coder(
        coder, [1, 2, 3], context=context, test_size_estimation=False)
    # Ensure that state was actually used.
    self.assertNotEqual(state, {})
    self.check_coder(
        coders.TupleCoder((coder, coder)), ([1], [2, 3]),
        context=context,
        test_size_estimation=False)
def extract_counter_value(monitoring_info_proto):
    """Return the int64 counter value carried by a counter monitoring info."""
    if is_counter(monitoring_info_proto):
        # Only SUM_INT64_TYPE is currently supported.
        return coders.VarIntCoder().decode(monitoring_info_proto.payload)
    raise ValueError('Unsupported type %s' % monitoring_info_proto.type)
def test_map_coder(self):
    """MapCoder round-trips a small dict, an empty dict, and a large dict."""
    map_coder = coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder())
    small_dict = {1: "one", 300: "three hundred"}
    large_dict = {i: str(i) for i in range(5000)}
    self.check_coder(map_coder, small_dict, {}, large_dict)
def test_timer_coder(self):
    """_TimerCoder round-trips {'timestamp', 'payload'} timer dicts."""
    # Unnested: bytes payloads at negative, zero, and positive microsecond
    # timestamps.
    self.check_coder(
        coders._TimerCoder(coders.BytesCoder()),
        *[{'timestamp': timestamp.Timestamp(micros=x), 'payload': b'xyz'}
          for x in (-3000, 0, 3000)])
    # Nested inside a one-element tuple coder with an int payload.
    self.check_coder(
        coders.TupleCoder((coders._TimerCoder(coders.VarIntCoder()),)),
        ({'timestamp': timestamp.Timestamp.of(37000), 'payload': 389},))
def _decode_distribution(value_coder, payload):
    """Returns a tuple of (count, sum, min, max)."""
    stream = coder_impl.create_InputStream(payload)
    count_impl = coders.VarIntCoder().get_impl()
    value_impl = value_coder.get_impl()
    # Wire order: varint count, then sum, min, max in the value coder's
    # encoding.
    count = count_impl.decode_from_stream(stream, True)
    dist_sum = value_impl.decode_from_stream(stream, True)
    dist_min = value_impl.decode_from_stream(stream, True)
    dist_max = value_impl.decode_from_stream(stream, True)
    return (count, dist_sum, dist_min, dist_max)
def _test_iterable_coder_of_unknown_length(self, count):
    """IterableCoder round-trips a generator of `count` ints (no len())."""
    def iter_generator(count):
        yield from range(count)

    iterable_coder = coders.IterableCoder(coders.VarIntCoder())
    decoded = iterable_coder.decode(
        iterable_coder.encode(iter_generator(count)))
    self.assertCountEqual(list(iter_generator(count)), decoded)
def _encode_distribution(value_coder, count, sum, min, max):
    """Encode (count, sum, min, max) into a distribution payload.

    Note: the `sum`, `min` and `max` parameters mirror the distribution field
    names and intentionally shadow the builtins within this function.
    """
    out = coder_impl.create_OutputStream()
    # Wire order: varint count, then sum, min, max in the value coder's
    # encoding.
    coders.VarIntCoder().get_impl().encode_to_stream(count, out, True)
    value_impl = value_coder.get_impl()
    for value in (sum, min, max):
        value_impl.encode_to_stream(value, out, True)
    return out.get()
def test_standard_int_coder(self):
    """The registry's int coder encodes identically to VarIntCoder."""
    real_coder = typecoders.registry.get_coder(int)
    expected_coder = coders.VarIntCoder()
    # A two-byte and a six-byte magnitude exercise multi-byte varints.
    for value in (0x0404, 0x040404040404):
        self.assertEqual(
            real_coder.encode(value), expected_coder.encode(value))
        self.assertEqual(value, real_coder.decode(real_coder.encode(value)))
def test_windowed_value_coder(self): coder = coders.WindowedValueCoder( coders.VarIntCoder(), coders.GlobalWindowCoder()) # Verify cloud object representation self.assertEqual({ '@type': 'kind:windowed_value', 'is_wrapper': True, 'component_encodings': [ coders.VarIntCoder().as_cloud_object(), coders.GlobalWindowCoder().as_cloud_object(), ], }, coder.as_cloud_object()) # Test binary representation self.assertEqual( b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01', coder.encode(window.GlobalWindows.windowed_value(1))) # Test decoding large timestamp self.assertEqual( coder.decode(b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00'), windowed_value.create(0, MIN_TIMESTAMP.micros, (GlobalWindow(), ))) # Test unnested self.check_coder( coders.WindowedValueCoder(coders.VarIntCoder()), windowed_value.WindowedValue(3, -100, ()), windowed_value.WindowedValue(-1, 100, (1, 2, 3))) # Test Global Window self.check_coder( coders.WindowedValueCoder( coders.VarIntCoder(), coders.GlobalWindowCoder()), window.GlobalWindows.windowed_value(1)) # Test nested self.check_coder( coders.TupleCoder(( coders.WindowedValueCoder(coders.FloatCoder()), coders.WindowedValueCoder(coders.StrUtf8Coder()))), ( windowed_value.WindowedValue(1.5, 0, ()), windowed_value.WindowedValue("abc", 10, ('window', ))))
def distribution_payload_combiner(payload_a, payload_b):
    """Merge two distribution payloads into a single combined payload."""
    coder = coders.VarIntCoder()
    count_a, sum_a, min_a, max_a = _decode_distribution(coder, payload_a)
    count_b, sum_b, min_b, max_b = _decode_distribution(coder, payload_b)
    # Counts and sums add; extrema take the tighter bound.
    combined = (
        count_a + count_b,
        sum_a + sum_b,
        min(min_a, min_b),
        max(max_a, max_b),
    )
    return _encode_distribution(coder, *combined)
def extract_distribution(monitoring_info_proto):
    """Returns a tuple of (count, sum, min, max).

    Args:
      monitoring_info_proto: The monitoring info for the distribution.
    """
    if is_distribution(monitoring_info_proto):
        # Only DISTRIBUTION_INT64_TYPE is currently supported.
        return _decode_distribution(
            coders.VarIntCoder(), monitoring_info_proto.payload)
    raise ValueError('Unsupported type %s' % monitoring_info_proto.type)
def test_sharded_key_coder(self):
    """ShardedKeyCoder: cloud object, repr, encoding, type hints, nesting."""
    # Triples of (key, expected encoded-key bytes, key coder).
    key_and_coders = [(b'', b'\x00', coders.BytesCoder()),
                      (b'key', b'\x03key', coders.BytesCoder()),
                      ('key', b'\03\x6b\x65\x79', coders.StrUtf8Coder()),
                      (('k', 1),
                       b'\x01\x6b\x01',
                       coders.TupleCoder(
                           (coders.StrUtf8Coder(), coders.VarIntCoder())))]
    for key, bytes_repr, key_coder in key_and_coders:
        coder = coders.ShardedKeyCoder(key_coder)
        # Verify cloud object representation
        self.assertEqual({
            '@type': 'kind:sharded_key',
            'component_encodings': [key_coder.as_cloud_object()]
        }, coder.as_cloud_object())
        # Test str repr
        self.assertEqual('%s' % coder, 'ShardedKeyCoder[%s]' % key_coder)
        # Encoding is the length-prefixed shard id followed by the key bytes.
        self.assertEqual(
            b'\x00' + bytes_repr, coder.encode(ShardedKey(key, b'')))
        self.assertEqual(
            b'\x03123' + bytes_repr, coder.encode(ShardedKey(key, b'123')))
        # Test unnested
        self.check_coder(coder, ShardedKey(key, b''))
        self.check_coder(coder, ShardedKey(key, b'123'))
        # Test type hints
        self.assertTrue(
            isinstance(
                coder.to_type_hint(),
                sharded_key_type.ShardedKeyTypeConstraint))
        key_type = coder.to_type_hint().key_type
        if isinstance(key_type, typehints.TupleConstraint):
            self.assertEqual(
                key_type.tuple_types, (type(key[0]), type(key[1])))
        else:
            self.assertEqual(key_type, type(key))
        # from_type_hint must reconstruct an equal coder.
        self.assertEqual(
            coders.ShardedKeyCoder.from_type_hint(
                coder.to_type_hint(), typecoders.CoderRegistry()),
            coder)
        # Pair this coder with every other key coder inside a tuple coder.
        for other_key, _, other_key_coder in key_and_coders:
            other_coder = coders.ShardedKeyCoder(other_key_coder)
            # Test nested
            self.check_coder(
                coders.TupleCoder((coder, other_coder)),
                (ShardedKey(key, b''), ShardedKey(other_key, b'')))
            self.check_coder(
                coders.TupleCoder((coder, other_coder)),
                (ShardedKey(key, b'123'), ShardedKey(other_key, b'')))
def int64_user_distribution(namespace, name, metric, ptransform=None):
    """Return the distribution monitoring info for the URN, metric and labels.

    Args:
      namespace: User-defined namespace of the distribution.
      name: Name of the distribution.
      metric: The DistributionData for the metric.
      ptransform: The ptransform id used as a label.
    """
    labels = create_labels(
        ptransform=ptransform, namespace=namespace, name=name)
    # Payload is the varint-encoded (count, sum, min, max) of the metric.
    payload = _encode_distribution(
        coders.VarIntCoder(), metric.count, metric.sum, metric.min, metric.max)
    return create_monitoring_info(
        USER_DISTRIBUTION_URN, DISTRIBUTION_INT64_TYPE, payload, labels)
def test_map_coder(self):
    """MapCoder and its deterministic variant round-trip assorted dicts."""
    test_dicts = [
        {1: "one", 300: "three hundred"},
        {},
        {i: str(i) for i in range(5000)},
    ]
    map_coder = coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder())
    self.check_coder(map_coder, *test_dicts)
    # The deterministic wrapper must round-trip the same values.
    self.check_coder(map_coder.as_deterministic_coder("label"), *test_dicts)
def int64_counter(urn, metric, ptransform=None, tag=None):
    # type: (...) -> metrics_pb2.MonitoringInfo
    """Return the counter monitoring info for the specified URN, metric and labels.

    Args:
      urn: The URN of the monitoring info/metric.
      metric: The payload field to use in the monitoring info or an int value.
      ptransform: The ptransform/step name used as a label.
      tag: The output tag name, used as a label.
    """
    labels = create_labels(ptransform=ptransform, tag=tag)
    if isinstance(metric, int):
        # Raw ints are varint-encoded; anything else is assumed to already be
        # an encoded payload and passes through unchanged.
        metric = coders.VarIntCoder().encode(metric)
    return create_monitoring_info(urn, SUM_INT64_TYPE, metric, labels)
def int64_user_counter(namespace, name, metric, ptransform=None):
    # type: (...) -> metrics_pb2.MonitoringInfo
    """Return the user counter monitoring info for the metric and labels.

    Args:
      namespace: User-defined namespace of the counter.
      name: Name of the counter.
      metric: The payload field to use in the monitoring info or an int value.
      ptransform: The ptransform id used as a label.
    """
    labels = create_labels(
        ptransform=ptransform, namespace=namespace, name=name)
    if isinstance(metric, int):
        # Raw ints are varint-encoded; anything else is assumed to already be
        # an encoded payload and passes through unchanged.
        metric = coders.VarIntCoder().encode(metric)
    return create_monitoring_info(
        USER_COUNTER_URN, SUM_INT64_TYPE, metric, labels)
def int64_distribution(urn, metric, ptransform=None, pcollection=None):
    # type: (...) -> metrics_pb2.MonitoringInfo
    """Return a distribution monitoring info for the URN, metric and labels.

    Args:
      urn: The URN of the monitoring info/metric.
      metric: The DistributionData for the metric.
      ptransform: The ptransform id used as a label.
      pcollection: The pcollection id used as a label.
    """
    labels = create_labels(ptransform=ptransform, pcollection=pcollection)
    # Payload is the varint-encoded (count, sum, min, max) of the metric.
    stats = (metric.count, metric.sum, metric.min, metric.max)
    payload = _encode_distribution(coders.VarIntCoder(), *stats)
    return create_monitoring_info(urn, DISTRIBUTION_INT64_TYPE, payload, labels)
def int64_counter(urn, metric, ptransform=None, pcollection=None, labels=None):
    # type: (...) -> metrics_pb2.MonitoringInfo
    """Return the counter monitoring info for the specified URN, metric and labels.

    Args:
      urn: The URN of the monitoring info/metric.
      metric: The payload field to use in the monitoring info or an int value.
      ptransform: The ptransform id used as a label.
      pcollection: The pcollection id used as a label.
      labels: Optional dict of additional labels; the caller's dict is not
        mutated.
    """
    # Copy the caller's labels: the previous `labels = labels or {}` followed
    # by labels.update(...) mutated the dict passed in by the caller.
    labels = dict(labels) if labels else {}
    labels.update(create_labels(ptransform=ptransform, pcollection=pcollection))
    if isinstance(metric, int):
        # Raw ints are varint-encoded; anything else is assumed to already be
        # an encoded payload and passes through unchanged.
        metric = coders.VarIntCoder().encode(metric)
    return create_monitoring_info(urn, SUM_INT64_TYPE, metric, labels)
def int64_gauge(urn, metric, ptransform=None):
    # type: (...) -> metrics_pb2.MonitoringInfo
    """Return the gauge monitoring info for the URN, metric and labels.

    Args:
      urn: The URN of the monitoring info/metric.
      metric: An int representing the value. The current time will be used for
        the timestamp.
      ptransform: The ptransform id used as a label.

    Raises:
      TypeError: If metric is not an int.
    """
    labels = create_labels(ptransform=ptransform)
    if isinstance(metric, int):
        value = metric
        # NOTE(review): int(time.time()) * 1000 truncates to whole seconds
        # before scaling to millis — confirm second granularity is intended.
        time_ms = int(time.time()) * 1000
    else:
        raise TypeError(
            'Expected int metric type but received %s with value %s' %
            (type(metric), metric))
    coder = coders.VarIntCoder()
    # Payload is the varint timestamp (ms) followed by the varint value.
    payload = coder.encode(time_ms) + coder.encode(value)
    return create_monitoring_info(urn, LATEST_INT64_TYPE, payload, labels)