def test_from_utc_datetime(self):
  self.assertEqual(
      Timestamp.from_utc_datetime(
          datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)),
      Timestamp(0))
  with self.assertRaisesRegexp(ValueError, r'UTC'):
    Timestamp.from_utc_datetime(datetime.datetime(1970, 1, 1))
def test_from_rfc3339(self):
  test_cases = [
      (10000000, '1970-04-26T17:46:40Z'),
      (10000000.000001, '1970-04-26T17:46:40.000001Z'),
      (1458343379.123456, '2016-03-18T23:22:59.123456Z'),
  ]
  for seconds_float, rfc3339_str in test_cases:
    self.assertEqual(
        Timestamp(seconds_float), Timestamp.from_rfc3339(rfc3339_str))
    self.assertEqual(
        rfc3339_str, Timestamp.from_rfc3339(rfc3339_str).to_rfc3339())
def finish_bundle(self):
  data = self._read_from_pubsub(self.source.timestamp_attribute)
  if data:
    output_pcollection = list(self._outputs)[0]
    bundle = self._evaluation_context.create_bundle(output_pcollection)
    # TODO(ccy): Respect the PubSub source's id_label field.
    for timestamp, message in data:
      if self.source.with_attributes:
        element = message
      else:
        element = message.data
      bundle.output(
          GlobalWindows.windowed_value(element, timestamp=timestamp))
    bundles = [bundle]
  else:
    bundles = []
  if self._applied_ptransform.inputs:
    input_pvalue = self._applied_ptransform.inputs[0]
  else:
    input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
  unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)
  # TODO(udim): Correct value for watermark hold.
  return TransformResult(
      self, bundles, [unprocessed_bundle], None,
      {None: Timestamp.of(time.time())})
def _get_element(message):
  parsed_message = PubsubMessage._from_message(message)
  if timestamp_attribute:
    try:
      rfc3339_or_milli = parsed_message.attributes[timestamp_attribute]
    except KeyError as e:
      raise KeyError('Timestamp attribute not found: %s' % e)
    try:
      timestamp = Timestamp.from_rfc3339(rfc3339_or_milli)
    except ValueError:
      try:
        timestamp = Timestamp(micros=int(rfc3339_or_milli) * 1000)
      except ValueError as e:
        raise ValueError('Bad timestamp value: %s' % e)
  else:
    timestamp = Timestamp.from_rfc3339(message.service_timestamp)
  return timestamp, parsed_message
def __init__(self, start, end):
  if start is not None or end is not None:
    self._start_object = Timestamp.of(start)
    self._end_object = Timestamp.of(end)
    try:
      self._start_micros = self._start_object.micros
    except OverflowError:
      self._start_micros = (
          MIN_TIMESTAMP.micros
          if self._start_object.micros < 0 else MAX_TIMESTAMP.micros)
    try:
      self._end_micros = self._end_object.micros
    except OverflowError:
      self._end_micros = (
          MIN_TIMESTAMP.micros
          if self._end_object.micros < 0 else MAX_TIMESTAMP.micros)
  else:
    # Micros must be populated elsewhere.
    self._start_object = self._end_object = None
def __init__(self, value, timestamp, windows):
  # For performance reasons, only timestamp_micros is stored by default
  # (as a C int). The Timestamp object is created on demand below.
  self.value = value
  if isinstance(timestamp, int):
    self.timestamp_micros = timestamp * 1000000
  else:
    self.timestamp_object = (
        timestamp
        if isinstance(timestamp, Timestamp) else Timestamp.of(timestamp))
    self.timestamp_micros = self.timestamp_object.micros
  self.windows = windows
def finish_bundle(self):
  data = self._read_from_pubsub()
  if data:
    output_pcollection = list(self._outputs)[0]
    bundle = self._evaluation_context.create_bundle(output_pcollection)
    # TODO(ccy): we currently do not use the PubSub message timestamp or
    # respect the PubSub source's id_label field.
    now = Timestamp.of(time.time())
    for message_data in data:
      bundle.output(
          GlobalWindows.windowed_value(message_data, timestamp=now))
    bundles = [bundle]
  else:
    bundles = []
  if self._applied_ptransform.inputs:
    input_pvalue = self._applied_ptransform.inputs[0]
  else:
    input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
  unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)
  return TransformResult(
      self._applied_ptransform, bundles, [unprocessed_bundle], None,
      {None: Timestamp.of(time.time())})
def _get_element(message):
  parsed_message = PubsubMessage._from_message(message)
  if (timestamp_attribute and
      timestamp_attribute in parsed_message.attributes):
    rfc3339_or_milli = parsed_message.attributes[timestamp_attribute]
    try:
      timestamp = Timestamp.from_rfc3339(rfc3339_or_milli)
    except ValueError:
      try:
        timestamp = Timestamp(micros=int(rfc3339_or_milli) * 1000)
      except ValueError as e:
        raise ValueError('Bad timestamp value: %s' % e)
  else:
    timestamp = Timestamp(
        message.publish_time.seconds, message.publish_time.nanos // 1000)
  return timestamp, parsed_message
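# A minimal sketch (an illustration, not part of the snippets above) of the
# two timestamp-attribute formats accepted by the readers above: an RFC 3339
# string or an integer count of milliseconds since the epoch. Both parse to
# the same Timestamp; the values are taken from test_from_rfc3339 above.
from apache_beam.utils.timestamp import Timestamp

assert Timestamp.from_rfc3339('1970-04-26T17:46:40Z') == Timestamp(10000000)
assert Timestamp(micros=int('10000000000') * 1000) == Timestamp(10000000)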
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  # TODO(BEAM-9322): Remove use of this experiment.
  # This flag is only necessary when using the multi-output TestStream b/c
  # it relies on using the PCollection output tags as the PCollection output
  # ids.
  options = StandardOptions(streaming=True)
  options.view_as(DebugOptions).add_experiment(
      'passthrough_pcollection_output_ids')

  p = TestPipeline(options=options)

  test_stream = (
      p
      | 'Mixed TestStream' >> TestStream()
      .advance_watermark_to(3, tag='main')
      .add_elements(['a1'], tag='main')
      .advance_watermark_to(8, tag='main')
      .add_elements(['a2'], tag='main')
      .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
      .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
      .advance_watermark_to_infinity(tag='main')
      .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(
        self,
        elm=beam.DoFn.ElementParam,
        ts=beam.DoFn.TimestampParam,
        side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()
def test_of(self):
  interval = Timestamp(123)
  self.assertEqual(id(interval), id(Timestamp.of(interval)))
  self.assertEqual(interval, Timestamp.of(123.0))
  with self.assertRaises(TypeError):
    Timestamp.of(Duration(10))
def _getTimestampFromProto():
  # type: () -> Timestamp
  ts_millis = int(
      common_urns.constants.GLOBAL_WINDOW_MAX_TIMESTAMP_MILLIS.constant)
  return Timestamp(micros=ts_millis * 1000)
def __init__(self, end):
  # type: (TimestampTypes) -> None
  self._end = Timestamp.of(end)
def __init__(self, processing_time, watermark):
  self._processing_time = Timestamp.of(processing_time)
  self._watermark = Timestamp.of(watermark)
def __init__(self, value, timestamp):
  self.value = value
  self.timestamp = Timestamp.of(timestamp)
def __init__(self, start, end):
  super(IntervalWindow, self).__init__(end)
  self.start = Timestamp.of(start)
def test_from_proto_fails_with_truncation(self):
  # TODO(https://github.com/apache/beam/issues/19922): Better define
  # timestamps.
  with self.assertRaises(ValueError):
    Timestamp.from_proto(timestamp_pb2.Timestamp(seconds=1234, nanos=56789))
def test_now(self):
  now = Timestamp.now()
  self.assertTrue(isinstance(now, Timestamp))
def to_language_type(self, value):
  # type: (MicrosInstantRepresentation) -> Timestamp
  return Timestamp(seconds=int(value.seconds), micros=int(value.micros))
def from_runner_api_parameter(fn_parameter, unused_context):
  return FixedWindows(
      size=Duration(micros=fn_parameter.size.ToMicroseconds()),
      offset=Timestamp(micros=fn_parameter.offset.ToMicroseconds()))
def __init__(self, end):
  self.end = Timestamp.of(end)
def __init__(self, size, period, offset=0):
  if size <= 0:
    raise ValueError('The size parameter must be strictly positive.')
  self.size = Duration.of(size)
  self.period = Duration.of(period)
  self.offset = Timestamp.of(offset) % period
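# A minimal illustration (an assumption for clarity, not part of the snippet
# above) of the offset normalization used there: Timestamp.of(offset) % period
# keeps the stored offset within [0, period). As test_arithmetic below shows,
# the % operator on a Timestamp returns a Duration.
from apache_beam.utils.timestamp import Duration, Timestamp

normalized = Timestamp.of(7) % Duration.of(5)
assert isinstance(normalized, Duration)
assert normalized == 2  # 7 seconds modulo a 5-second period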
def test_to_proto(self):
  ts = Timestamp(seconds=1234, micros=56)
  actual_ts_proto = Timestamp.to_proto(ts)
  expected_ts_proto = timestamp_pb2.Timestamp(seconds=1234, nanos=56000)
  self.assertEqual(actual_ts_proto, expected_ts_proto)
def __init__(self, timestamp, element=None, window=None):
  self.timestamp = Timestamp.of(timestamp)
  self.element = element
  self.window = window
def test_from_rfc3339_failure(self):
  with self.assertRaisesRegex(ValueError, 'parse'):
    Timestamp.from_rfc3339('not rfc3339')
  with self.assertRaisesRegex(ValueError, 'parse'):
    Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z unparseable')
def __init__(self, new_watermark, tag=None):
  self.new_watermark = Timestamp.of(new_watermark)
  self.tag = tag
def test_from_utc_datetime(self):
  self.assertEqual(
      Timestamp.from_utc_datetime(
          datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)),
      Timestamp(0))
  with self.assertRaisesRegex(ValueError, r'UTC'):
    Timestamp.from_utc_datetime(datetime.datetime(1970, 1, 1))
def decode_from_stream(self, in_stream, nested):
  return Timestamp(micros=in_stream.read_bigendian_int64())
def test_arithmetic(self):
  # Supported operations.
  self.assertEqual(Timestamp(123) + 456, 579)
  self.assertEqual(Timestamp(123) + Duration(456), 579)
  self.assertEqual(456 + Timestamp(123), 579)
  self.assertEqual(Duration(456) + Timestamp(123), 579)
  self.assertEqual(Timestamp(123) - 456, -333)
  self.assertEqual(Timestamp(123) - Duration(456), -333)
  self.assertEqual(Timestamp(1230) % 456, 318)
  self.assertEqual(Timestamp(1230) % Duration(456), 318)
  self.assertEqual(Timestamp(123) - Timestamp(100), 23)

  # Check that direct comparison of Timestamp and Duration is allowed.
  self.assertTrue(Duration(123) == Timestamp(123))
  self.assertTrue(Timestamp(123) == Duration(123))
  self.assertFalse(Duration(123) == Timestamp(1230))
  self.assertFalse(Timestamp(123) == Duration(1230))

  # Check return types.
  self.assertEqual((Timestamp(123) + 456).__class__, Timestamp)
  self.assertEqual((Timestamp(123) + Duration(456)).__class__, Timestamp)
  self.assertEqual((456 + Timestamp(123)).__class__, Timestamp)
  self.assertEqual((Duration(456) + Timestamp(123)).__class__, Timestamp)
  self.assertEqual((Timestamp(123) - 456).__class__, Timestamp)
  self.assertEqual((Timestamp(123) - Duration(456)).__class__, Timestamp)
  self.assertEqual((Timestamp(1230) % 456).__class__, Duration)
  self.assertEqual((Timestamp(1230) % Duration(456)).__class__, Duration)
  self.assertEqual((Timestamp(123) - Timestamp(100)).__class__, Duration)

  # Unsupported operations.
  with self.assertRaises(TypeError):
    self.assertEqual(Timestamp(123) * 456, 56088)
  with self.assertRaises(TypeError):
    self.assertEqual(Timestamp(123) * Duration(456), 56088)
  with self.assertRaises(TypeError):
    self.assertEqual(456 * Timestamp(123), 56088)
  with self.assertRaises(TypeError):
    self.assertEqual(Duration(456) * Timestamp(123), 56088)
  with self.assertRaises(TypeError):
    self.assertEqual(456 - Timestamp(123), 333)
  with self.assertRaises(TypeError):
    self.assertEqual(Duration(456) - Timestamp(123), 333)
  with self.assertRaises(TypeError):
    self.assertEqual(-Timestamp(123), -123)  # pylint: disable=invalid-unary-operand-type
  with self.assertRaises(TypeError):
    self.assertEqual(-Timestamp(123), -Duration(123))  # pylint: disable=invalid-unary-operand-type
  with self.assertRaises(TypeError):
    self.assertEqual(1230 % Timestamp(456), 318)
  with self.assertRaises(TypeError):
    self.assertEqual(Duration(1230) % Timestamp(456), 318)
def __init__(self, value, timestamp):
  # type: (Any, TimestampTypes) -> None
  self.value = value
  self.timestamp = Timestamp.of(timestamp)
class RowCoderTest(unittest.TestCase):
  JON_SNOW = Person(
      name="Jon Snow",
      age=np.int32(23),
      address=None,
      aliases=["crow", "wildling"],
      knows_javascript=False,
      payload=None,
      custom_metadata={},
      favorite_time=Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z'),
  )
  PEOPLE = [
      JON_SNOW,
      Person(
          "Daenerys Targaryen",
          np.int32(25),
          "Westeros",
          ["Mother of Dragons"],
          False,
          None,
          {"dragons": 3},
          Timestamp.from_rfc3339('1970-04-26T17:46:40Z'),
      ),
      Person(
          "Michael Bluth",
          np.int32(30),
          None,
          [],
          True,
          b"I've made a huge mistake",
          {},
          Timestamp.from_rfc3339('2020-08-12T15:51:00.032Z'))
  ]

  def test_create_row_coder_from_named_tuple(self):
    expected_coder = RowCoder(typing_to_runner_api(Person).row_type.schema)
    real_coder = coders_registry.get_coder(Person)

    for test_case in self.PEOPLE:
      self.assertEqual(
          expected_coder.encode(test_case), real_coder.encode(test_case))
      self.assertEqual(
          test_case, real_coder.decode(real_coder.encode(test_case)))

  def test_create_row_coder_from_schema(self):
    schema = schema_pb2.Schema(
        id="person",
        fields=[
            schema_pb2.Field(
                name="name",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING)),
            schema_pb2.Field(
                name="age",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
            schema_pb2.Field(
                name="address",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True)),
            schema_pb2.Field(
                name="aliases",
                type=schema_pb2.FieldType(
                    array_type=schema_pb2.ArrayType(
                        element_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING)))),
            schema_pb2.Field(
                name="knows_javascript",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.BOOLEAN)),
            schema_pb2.Field(
                name="payload",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.BYTES, nullable=True)),
            schema_pb2.Field(
                name="custom_metadata",
                type=schema_pb2.FieldType(
                    map_type=schema_pb2.MapType(
                        key_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING),
                        value_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.INT64),
                    ))),
            schema_pb2.Field(
                name="favorite_time",
                type=schema_pb2.FieldType(
                    logical_type=schema_pb2.LogicalType(
                        urn="beam:logical_type:micros_instant:v1",
                        representation=schema_pb2.FieldType(
                            row_type=schema_pb2.RowType(
                                schema=schema_pb2.Schema(
                                    id="micros_instant",
                                    fields=[
                                        schema_pb2.Field(
                                            name="seconds",
                                            type=schema_pb2.FieldType(
                                                atomic_type=schema_pb2.INT64)),
                                        schema_pb2.Field(
                                            name="micros",
                                            type=schema_pb2.FieldType(
                                                atomic_type=schema_pb2.INT64)),
                                    ])))))),
        ])
    coder = RowCoder(schema)

    for test_case in self.PEOPLE:
      self.assertEqual(test_case, coder.decode(coder.encode(test_case)))

  @unittest.skip(
      "BEAM-8030 - Overflow behavior in VarIntCoder is currently inconsistent")
  def test_overflows(self):
    IntTester = typing.NamedTuple(
        'IntTester',
        [
            # TODO(BEAM-7996): Test int8 and int16 here as well when those
            # types are supported
            # ('i8', typing.Optional[np.int8]),
            # ('i16', typing.Optional[np.int16]),
            ('i32', typing.Optional[np.int32]),
            ('i64', typing.Optional[np.int64]),
        ])

    c = RowCoder.from_type_hint(IntTester, None)

    no_overflow = chain(
        (IntTester(i32=i, i64=None) for i in (-2**31, 2**31 - 1)),
        (IntTester(i32=None, i64=i) for i in (-2**63, 2**63 - 1)),
    )

    # Encode max/min ints to make sure they don't throw any error
    for case in no_overflow:
      c.encode(case)

    overflow = chain(
        (IntTester(i32=i, i64=None) for i in (-2**31 - 1, 2**31)),
        (IntTester(i32=None, i64=i) for i in (-2**63 - 1, 2**63)),
    )

    # Encode max+1/min-1 ints to make sure they DO throw an error
    # pylint: disable=cell-var-from-loop
    for case in overflow:
      self.assertRaises(OverflowError, lambda: c.encode(case))
  def test_none_in_non_nullable_field_throws(self):
    Test = typing.NamedTuple('Test', [('foo', str)])
    c = RowCoder.from_type_hint(Test, None)
    self.assertRaises(ValueError, lambda: c.encode(Test(foo=None)))

  def test_schema_remove_column(self):
    fields = [("field1", str), ("field2", str)]
    # new schema is missing one field that was in the old schema
    Old = typing.NamedTuple('Old', fields)
    New = typing.NamedTuple('New', fields[:-1])

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    self.assertEqual(
        New("foo"), new_coder.decode(old_coder.encode(Old("foo", "bar"))))

  def test_schema_add_column(self):
    fields = [("field1", str), ("field2", typing.Optional[str])]
    # new schema has one (optional) field that didn't exist in the old schema
    Old = typing.NamedTuple('Old', fields[:-1])
    New = typing.NamedTuple('New', fields)

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    self.assertEqual(
        New("bar", None), new_coder.decode(old_coder.encode(Old("bar"))))

  def test_schema_add_column_with_null_value(self):
    fields = [("field1", typing.Optional[str]), ("field2", str),
              ("field3", typing.Optional[str])]
    # new schema has one (optional) field that didn't exist in the old schema
    Old = typing.NamedTuple('Old', fields[:-1])
    New = typing.NamedTuple('New', fields)

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    self.assertEqual(
        New(None, "baz", None),
        new_coder.decode(old_coder.encode(Old(None, "baz"))))

  def test_row_coder_picklable(self):
    # occasionally coders can get pickled, RowCoder should be able to
    # handle it
    coder = coders_registry.get_coder(Person)
    roundtripped = pickler.loads(pickler.dumps(coder))

    self.assertEqual(roundtripped, coder)

  def test_row_coder_in_pipeline(self):
    with TestPipeline() as p:
      res = (
          p
          | beam.Create(self.PEOPLE)
          | beam.Filter(lambda person: person.name == "Jon Snow"))
      assert_that(res, equal_to([self.JON_SNOW]))

  def test_row_coder_nested_struct(self):
    Pair = typing.NamedTuple('Pair', [('left', Person), ('right', Person)])
    value = Pair(self.PEOPLE[0], self.PEOPLE[1])
    coder = RowCoder(typing_to_runner_api(Pair).row_type.schema)

    self.assertEqual(value, coder.decode(coder.encode(value)))

  def test_row_coder_fail_early_bad_schema(self):
    schema_proto = schema_pb2.Schema(
        fields=[
            schema_pb2.Field(
                name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        ])

    # Should raise an exception referencing the problem field
    self.assertRaisesRegex(
        ValueError, "type_with_no_typeinfo", lambda: RowCoder(schema_proto))
def from_runner_api_parameter(fn_parameter, unused_context):
  # type: (...) -> SlidingWindows
  return SlidingWindows(
      size=Duration(micros=fn_parameter.size.ToMicroseconds()),
      offset=Timestamp(micros=fn_parameter.offset.ToMicroseconds()),
      period=Duration(micros=fn_parameter.period.ToMicroseconds()))
def test_wordcount(self):
  class WordExtractingDoFn(beam.DoFn):
    def process(self, element):
      text_line = element.strip()
      words = text_line.split()
      return words

  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(
          direct_runner.DirectRunner()))

  # Count the occurrences of each word.
  counts = (
      p
      | beam.Create(['to be or not to be that is the question'])
      | 'split' >> beam.ParDo(WordExtractingDoFn())
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

  # Watch the local scope for Interactive Beam so that counts will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  result = p.run()
  result.wait_until_finish()

  actual = list(result.get(counts))
  self.assertSetEqual(
      set(actual),
      set([
          ('or', 1),
          ('that', 1),
          ('be', 2),
          ('is', 1),
          ('question', 1),
          ('to', 2),
          ('the', 1),
          ('not', 1),
      ]))

  # Truncate the precision to millis because the window coder uses millis
  # as units then gets upcast to micros.
  end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
  df_counts = ib.collect(counts, include_window_info=True, n=10)
  df_expected = pd.DataFrame(
      {
          0: [e[0] for e in actual],
          1: [e[1] for e in actual],
          'event_time': [end_of_window for _ in actual],
          'windows': [[GlobalWindow()] for _ in actual],
          'pane_info': [
              PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
              for _ in actual
          ]
      },
      columns=[0, 1, 'event_time', 'windows', 'pane_info'])
  pd.testing.assert_frame_equal(df_expected, df_counts)

  actual_reified = result.get(counts, include_window_info=True)
  expected_reified = [
      WindowedValue(
          e,
          Timestamp(micros=end_of_window),
          [GlobalWindow()],
          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)) for e in actual
  ]
  self.assertEqual(actual_reified, expected_reified)
class StandardCodersTest(unittest.TestCase):

  _urn_to_json_value_parser = {
      'beam:coder:bytes:v1': lambda x: x.encode('utf-8'),
      'beam:coder:string_utf8:v1': lambda x: x,
      'beam:coder:varint:v1': lambda x: x,
      'beam:coder:kv:v1': lambda x, key_parser, value_parser:
          (key_parser(x['key']), value_parser(x['value'])),
      'beam:coder:interval_window:v1': lambda x: IntervalWindow(
          start=Timestamp(micros=(x['end'] - x['span']) * 1000),
          end=Timestamp(micros=x['end'] * 1000)),
      'beam:coder:iterable:v1': lambda x, parser: list(map(parser, x)),
      'beam:coder:global_window:v1': lambda x: window.GlobalWindow(),
      'beam:coder:windowed_value:v1':
          lambda x, value_parser, window_parser: windowed_value.create(
              value_parser(x['value']),
              x['timestamp'] * 1000,
              tuple([window_parser(w) for w in x['windows']])),
      'beam:coder:timer:v1': lambda x, payload_parser: dict(
          payload=payload_parser(x['payload']),
          timestamp=Timestamp(micros=x['timestamp'] * 1000)),
      'beam:coder:double:v1': parse_float,
  }

  def test_standard_coders(self):
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    def assert_equal(actual, expected):
      """Handle nan values which self.assertEqual fails on."""
      if (isinstance(actual, float) and isinstance(expected, float) and
          math.isnan(actual) and math.isnan(expected)):
        return
      self.assertEqual(actual, expected)

    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            decoded = decode_nested(coder, expected_encoded, nested)
            assert_equal(decoded, value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(
              decode_nested(coder, expected_encoded, nested), value)

  def parse_coder(self, spec):
    context = pipeline_context.PipelineContext()
    coder_id = str(hash(str(spec)))
    component_ids = [
        context.coders.get_id(self.parse_coder(c))
        for c in spec.get('components', ())
    ]
    context.coders.put_proto(
        coder_id,
        beam_runner_api_pb2.Coder(
            spec=beam_runner_api_pb2.FunctionSpec(
                urn=spec['urn'], payload=spec.get('payload')),
            component_coder_ids=component_ids))
    return context.coders.get_by_id(coder_id)

  def json_value_parser(self, coder_spec):
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())
    ]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.
  fix = False
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    if cls.fix and cls.to_fix:
      print("FIXING", len(cls.to_fix), "TESTS")
      doc_sep = '\n---\n'
      docs = open(STANDARD_CODERS_YAML).read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print(quote(expected_encoded), "->", quote(actual_encoded))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      open(STANDARD_CODERS_YAML, 'w').write(doc_sep.join(docs))
def test_timestamps(self):
  wv = windowed_value.WindowedValue(None, 3, ())
  self.assertEqual(wv.timestamp, Timestamp.of(3))
  self.assertTrue(wv.timestamp is wv.timestamp)
  self.assertEqual(
      windowed_value.WindowedValue(None, -2.5, ()).timestamp,
      Timestamp.of(-2.5))
def test_from_rfc3339_failure(self):
  with self.assertRaisesRegexp(ValueError, 'parse'):
    Timestamp.from_rfc3339('not rfc3339')
  with self.assertRaisesRegexp(ValueError, 'parse'):
    Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z unparseable')
class StandardCodersTest(unittest.TestCase):

  _urn_to_coder_class = {
      'urn:beam:coders:bytes:0.1': coders.BytesCoder,
      'urn:beam:coders:varint:0.1': coders.VarIntCoder,
      'urn:beam:coders:kv:0.1': lambda k, v: coders.TupleCoder((k, v)),
      'urn:beam:coders:interval_window:0.1': coders.IntervalWindowCoder,
      'urn:beam:coders:stream:0.1': lambda t: coders.IterableCoder(t),
      'urn:beam:coders:global_window:0.1': coders.GlobalWindowCoder,
      'urn:beam:coders:windowed_value:0.1':
          lambda v, w: coders.WindowedValueCoder(v, w)
  }

  _urn_to_json_value_parser = {
      'urn:beam:coders:bytes:0.1': lambda x: x,
      'urn:beam:coders:varint:0.1': lambda x: x,
      'urn:beam:coders:kv:0.1': lambda x, key_parser, value_parser:
          (key_parser(x['key']), value_parser(x['value'])),
      'urn:beam:coders:interval_window:0.1': lambda x: IntervalWindow(
          start=Timestamp(micros=(x['end'] - x['span']) * 1000),
          end=Timestamp(micros=x['end'] * 1000)),
      'urn:beam:coders:stream:0.1': lambda x, parser: map(parser, x),
      'urn:beam:coders:global_window:0.1': lambda x: window.GlobalWindow(),
      'urn:beam:coders:windowed_value:0.1':
          lambda x, value_parser, window_parser: windowed_value.create(
              value_parser(x['value']),
              x['timestamp'] * 1000,
              tuple([window_parser(w) for w in x['windows']]))
  }

  def test_standard_coders(self):
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            self.assertEqual(
                decode_nested(coder, expected_encoded, nested), value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(
              decode_nested(coder, expected_encoded, nested), value)

  def parse_coder(self, spec):
    return self._urn_to_coder_class[spec['urn']](
        *[self.parse_coder(c) for c in spec.get('components', ())])

  def json_value_parser(self, coder_spec):
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())
    ]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.
  fix = False
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    if cls.fix and cls.to_fix:
      print "FIXING", len(cls.to_fix), "TESTS"
      doc_sep = '\n---\n'
      docs = open(STANDARD_CODERS_YAML).read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print quote(expected_encoded), "->", quote(actual_encoded)
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      open(STANDARD_CODERS_YAML, 'w').write(doc_sep.join(docs))
def start(self):
  # type: () -> Timestamp
  if self._start_object is None:
    self._start_object = Timestamp(0, self._start_micros)
  return self._start_object
def end(self):
  # type: () -> Timestamp
  if self._end_object is None:
    self._end_object = Timestamp(0, self._end_micros)
  return self._end_object
def advance_watermark(self, watermark_secs):
  record = TestStreamFileRecord(
      recorded_event=TestStreamPayload.Event(
          watermark_event=TestStreamPayload.Event.AdvanceWatermark(
              new_watermark=Timestamp.of(watermark_secs).micros)))
  self._records.append(record)
  return self
def timestamp(self):
  # type: () -> Timestamp
  if self.timestamp_object is None:
    self.timestamp_object = Timestamp(0, self.timestamp_micros)
  return self.timestamp_object