Esempio n. 1
0
 def test_from_utc_datetime(self):
   """Checks from_utc_datetime on a UTC-aware and on a naive datetime."""
   # The Unix epoch expressed as a UTC-aware datetime must equal Timestamp(0).
   self.assertEqual(
       Timestamp.from_utc_datetime(datetime.datetime(1970, 1, 1,
                                                     tzinfo=pytz.utc)),
       Timestamp(0))
   # A datetime without tzinfo must be rejected with a ValueError whose
   # message mentions UTC. assertRaisesRegex replaces the deprecated
   # assertRaisesRegexp alias, which no longer exists in Python 3.12+.
   with self.assertRaisesRegex(ValueError, r'UTC'):
     Timestamp.from_utc_datetime(datetime.datetime(1970, 1, 1))
Esempio n. 2
0
 def test_from_rfc3339(self):
   """Checks RFC 3339 parsing against epoch seconds and round-trips the text."""
   # Each pair is (seconds since the epoch, the matching RFC 3339 string).
   expectations = (
       (10000000, '1970-04-26T17:46:40Z'),
       (10000000.000001, '1970-04-26T17:46:40.000001Z'),
       (1458343379.123456, '2016-03-18T23:22:59.123456Z'),
   )
   for epoch_seconds, text in expectations:
     from_seconds = Timestamp(epoch_seconds)
     self.assertEqual(from_seconds, Timestamp.from_rfc3339(text))
     # Formatting the parsed value must reproduce the input string exactly.
     round_tripped = Timestamp.from_rfc3339(text).to_rfc3339()
     self.assertEqual(text, round_tripped)
  def finish_bundle(self):
    """Packages messages read from Pub/Sub into a bundle and builds the result.

    Returns:
      A TransformResult built from self, the output bundles (empty when
      nothing was read), one unprocessed bundle created for the transform's
      input, None, and a {None: current time} mapping (the watermark hold
      mentioned in the TODO below).
    """
    data = self._read_from_pubsub(self.source.timestamp_attribute)
    bundles = []
    if data:
      output_pcollection = list(self._outputs)[0]
      bundle = self._evaluation_context.create_bundle(output_pcollection)
      # TODO(ccy): Respect the PubSub source's id_label field.
      for timestamp, message in data:
        # Emit the whole message only when attributes were requested;
        # otherwise emit just its data.
        element = message if self.source.with_attributes else message.data
        bundle.output(
            GlobalWindows.windowed_value(element, timestamp=timestamp))
      bundles.append(bundle)

    applied = self._applied_ptransform
    if applied.inputs:
      input_pvalue = applied.inputs[0]
    else:
      # A transform without inputs hangs off a PBegin for its pipeline.
      input_pvalue = pvalue.PBegin(applied.transform.pipeline)
    unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)

    # TODO(udim): Correct value for watermark hold.
    return TransformResult(self, bundles, [unprocessed_bundle], None,
                           {None: Timestamp.of(time.time())})
      def _get_element(message):
        """Returns a (timestamp, parsed message) pair for a Pub/Sub message.

        timestamp_attribute comes from the enclosing scope. When it is set,
        the named attribute is read as RFC 3339 text or, failing that, as an
        integer number of milliseconds; otherwise the message's
        service_timestamp is parsed as RFC 3339.

        Raises:
          KeyError: the configured attribute is missing from the message.
          ValueError: the attribute value is neither RFC 3339 nor an integer.
        """
        parsed_message = PubsubMessage._from_message(message)
        if not timestamp_attribute:
          service_time = Timestamp.from_rfc3339(message.service_timestamp)
          return service_time, parsed_message

        try:
          raw_value = parsed_message.attributes[timestamp_attribute]
        except KeyError as e:
          raise KeyError('Timestamp attribute not found: %s' % e)

        try:
          timestamp = Timestamp.from_rfc3339(raw_value)
        except ValueError:
          # Not RFC 3339: fall back to milliseconds since the epoch.
          try:
            timestamp = Timestamp(micros=int(raw_value) * 1000)
          except ValueError as e:
            raise ValueError('Bad timestamp value: %s' % e)
        return timestamp, parsed_message
Esempio n. 5
0
 def __init__(self, start, end):
   """Initializes the start and end bounds of the interval.

   Each bound is handled on its own, so a None bound is never passed to
   Timestamp.of. A bound that is not None is converted with Timestamp.of and
   its micros value is cached; if storing it raises OverflowError the cached
   value is clamped to MIN_TIMESTAMP.micros or MAX_TIMESTAMP.micros. For a
   None bound the Timestamp object is set to None and the micros field must
   be populated elsewhere.

   Args:
     start: start of the interval, or None.
     end: end of the interval, or None.
   """
   if start is not None:
     self._start_object = Timestamp.of(start)
     try:
       self._start_micros = self._start_object.micros
     except OverflowError:
       # Clamp towards whichever extreme the out-of-range value lies on.
       self._start_micros = (
           MIN_TIMESTAMP.micros if self._start_object.micros < 0
           else MAX_TIMESTAMP.micros)
   else:
     # Micros must be populated elsewhere.
     self._start_object = None

   if end is not None:
     self._end_object = Timestamp.of(end)
     try:
       self._end_micros = self._end_object.micros
     except OverflowError:
       self._end_micros = (
           MIN_TIMESTAMP.micros if self._end_object.micros < 0
           else MAX_TIMESTAMP.micros)
   else:
     # Micros must be populated elsewhere.
     self._end_object = None
Esempio n. 6
0
 def __init__(self, value, timestamp, windows):
   """Stores the value, its windows and the timestamp in microseconds.

   Args:
     value: the wrapped element.
     timestamp: an int (multiplied by 1000000 to obtain micros), a Timestamp,
       or any other value accepted by Timestamp.of.
     windows: the windows associated with the value.
   """
   # For performance reasons, only timestamp_micros is stored by default
   # (as a C int). The Timestamp object is created on demand below.
   self.value = value
   if not isinstance(timestamp, int):
     if isinstance(timestamp, Timestamp):
       self.timestamp_object = timestamp
     else:
       self.timestamp_object = Timestamp.of(timestamp)
     self.timestamp_micros = self.timestamp_object.micros
   else:
     # Plain ints never construct a Timestamp object here.
     self.timestamp_micros = timestamp * 1000000
   self.windows = windows
Esempio n. 7
0
 def finish_bundle(self):
   """Outputs everything read from Pub/Sub, stamped with the current time.

   Returns:
     A TransformResult built from the applied transform, the output bundles
     (empty when nothing was read), one unprocessed bundle created for the
     transform's input, None, and a {None: current time} mapping.
   """
   data = self._read_from_pubsub()
   bundles = []
   if data:
     output_pcollection = list(self._outputs)[0]
     bundle = self._evaluation_context.create_bundle(output_pcollection)
     # TODO(ccy): we currently do not use the PubSub message timestamp or
     # respect the PubSub source's id_label field.
     now = Timestamp.of(time.time())
     for message_data in data:
       windowed = GlobalWindows.windowed_value(message_data, timestamp=now)
       bundle.output(windowed)
     bundles.append(bundle)

   applied = self._applied_ptransform
   if applied.inputs:
     input_pvalue = applied.inputs[0]
   else:
     # A transform without inputs hangs off a PBegin for its pipeline.
     input_pvalue = pvalue.PBegin(applied.transform.pipeline)
   unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)
   return TransformResult(
       self._applied_ptransform, bundles,
       [unprocessed_bundle], None, {None: Timestamp.of(time.time())})
    def _get_element(message):
      """Returns a (timestamp, parsed message) pair for a Pub/Sub message.

      timestamp_attribute comes from the enclosing scope. When it is set and
      present on the message, its value is read as RFC 3339 text or, failing
      that, as an integer number of milliseconds. Otherwise the timestamp is
      built from the message's publish_time (seconds plus nanos // 1000).

      Raises:
        ValueError: the attribute value is neither RFC 3339 nor an integer.
      """
      parsed_message = PubsubMessage._from_message(message)
      use_attribute = (
          timestamp_attribute and
          timestamp_attribute in parsed_message.attributes)
      if not use_attribute:
        publish_timestamp = Timestamp(message.publish_time.seconds,
                                      message.publish_time.nanos // 1000)
        return publish_timestamp, parsed_message

      raw_value = parsed_message.attributes[timestamp_attribute]
      try:
        timestamp = Timestamp.from_rfc3339(raw_value)
      except ValueError:
        # Not RFC 3339: fall back to milliseconds since the epoch.
        try:
          timestamp = Timestamp(micros=int(raw_value) * 1000)
        except ValueError as e:
          raise ValueError('Bad timestamp value: %s' % e)
      return timestamp, parsed_message
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering.

        A single TestStream produces a 'main' and a 'side' output. The side
        output is windowed, combined per key and handed to a ParDo over the
        main output as a list side input; the test asserts, per window, the
        (element, timestamp, side input) triples the ParDo emits.
        """
        # TODO(BEAM-9322): Remove use of this experiment.
        # This flag is only necessary when using the multi-output TestStream b/c
        # it relies on using the PCollection output tags as the PCollection output
        # ids.
        options = StandardOptions(streaming=True)
        options.view_as(DebugOptions).add_experiment(
            'passthrough_pcollection_output_ids')

        p = TestPipeline(options=options)

        # 'main' receives 'a1' after its watermark advances to 3 and 'a2' after
        # it advances to 8; 'side' receives ('k', 100) stamped 2 and ('k', 400)
        # stamped 7. Both watermarks are then advanced to infinity.
        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream().advance_watermark_to(
                3, tag='main').add_elements(
                    ['a1'], tag='main').advance_watermark_to(
                        8, tag='main').add_elements(['a2'], tag='main').
            add_elements([window.TimestampedValue(
                ('k', 100), 2)], tag='side').add_elements(
                    [window.TimestampedValue(('k', 400), 7)],
                    tag='side').advance_watermark_to_infinity(
                        tag='main').advance_watermark_to_infinity(tag='side'))

        # Main input: FixedWindows(5) with discarding accumulation.
        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        # Side input: the same windows plus an AfterCount(1) early trigger; the
        # per-key sums are then reduced to the bare summed values.
        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        # Emits each main element together with its timestamp and the side
        # input it was given, so the assertion below can inspect all three.
        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

        # Each main element is expected to see two side-input values for its
        # window: the sum of the side element and a 0.
        # NOTE(review): the 0 is presumably the empty on-time pane produced
        # under DISCARDING mode -- confirm.
        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Esempio n. 10
0
 def test_of(self):
   """Timestamp.of keeps Timestamps, converts floats and rejects Durations."""
   original = Timestamp(123)
   # Passing a Timestamp must hand back the very same object.
   returned = Timestamp.of(original)
   self.assertEqual(id(original), id(returned))
   converted = Timestamp.of(123.0)
   self.assertEqual(original, converted)
   with self.assertRaises(TypeError):
     Timestamp.of(Duration(10))
Esempio n. 11
0
 def _getTimestampFromProto():
     # type: () -> Timestamp
     """Builds a Timestamp from GLOBAL_WINDOW_MAX_TIMESTAMP_MILLIS."""
     millis_constant = (
         common_urns.constants.GLOBAL_WINDOW_MAX_TIMESTAMP_MILLIS.constant)
     # The constant is in milliseconds; Timestamp is built from microseconds.
     return Timestamp(micros=int(millis_constant) * 1000)
Esempio n. 12
0
 def __init__(self, end):
     # type: (TimestampTypes) -> None
     """Stores end, converted with Timestamp.of, as self._end."""
     end_timestamp = Timestamp.of(end)
     self._end = end_timestamp
Esempio n. 13
0
 def __init__(self, processing_time, watermark):
   """Stores both arguments after converting them with Timestamp.of.

   Args:
     processing_time: value kept as self._processing_time.
     watermark: value kept as self._watermark.
   """
   processing_timestamp = Timestamp.of(processing_time)
   self._processing_time = processing_timestamp
   watermark_timestamp = Timestamp.of(watermark)
   self._watermark = watermark_timestamp
Esempio n. 14
0
 def __init__(self, value, timestamp):
     """Pairs a value with a timestamp converted through Timestamp.of.

     Args:
       value: stored unchanged as self.value.
       timestamp: converted with Timestamp.of and stored as self.timestamp.
     """
     self.value = value
     converted = Timestamp.of(timestamp)
     self.timestamp = converted
Esempio n. 15
0
 def __init__(self, start, end):
   """Initializes the window from its start and end.

   Args:
     start: converted with Timestamp.of and stored as self.start.
     end: forwarded to the parent class initializer.
   """
   # The parent initializer runs first, exactly as before, so that it sees
   # end before start is converted.
   super(IntervalWindow, self).__init__(end)
   start_timestamp = Timestamp.of(start)
   self.start = start_timestamp
Esempio n. 16
0
 def test_from_proto_fails_with_truncation(self):
     """from_proto must raise ValueError for nanos that are not whole micros."""
     # TODO(https://github.com/apache/beam/issues/19922): Better define
     # timestamps.
     def convert_proto_with_sub_micro_nanos():
         # 56789 nanos is not a whole number of microseconds.
         return Timestamp.from_proto(
             timestamp_pb2.Timestamp(seconds=1234, nanos=56789))

     self.assertRaises(ValueError, convert_proto_with_sub_micro_nanos)
Esempio n. 17
0
 def test_now(self):
     """Timestamp.now() must return a Timestamp instance."""
     self.assertIsInstance(Timestamp.now(), Timestamp)
Esempio n. 18
0
 def to_language_type(self, value):
     # type: (MicrosInstantRepresentation) -> Timestamp
     """Builds a Timestamp from the seconds and micros fields of value."""
     seconds_part = int(value.seconds)
     micros_part = int(value.micros)
     return Timestamp(seconds=seconds_part, micros=micros_part)
Esempio n. 19
0
 def from_runner_api_parameter(fn_parameter, unused_context):
     """Builds a FixedWindows from a parameter's size and offset fields.

     Args:
       fn_parameter: object whose size and offset each expose
         ToMicroseconds().
       unused_context: ignored.

     Returns:
       A FixedWindows with the size as a Duration and the offset as a
       Timestamp, both taken in microseconds.
     """
     window_size = Duration(micros=fn_parameter.size.ToMicroseconds())
     window_offset = Timestamp(micros=fn_parameter.offset.ToMicroseconds())
     return FixedWindows(size=window_size, offset=window_offset)
Esempio n. 20
0
 def __init__(self, end):
     """Stores end, converted with Timestamp.of, as self.end."""
     end_timestamp = Timestamp.of(end)
     self.end = end_timestamp
Esempio n. 21
0
 def __init__(self, size, period, offset=0):
     """Stores size, period and the offset reduced modulo the period.

     Args:
       size: window size; converted with Duration.of.
       period: window period; converted with Duration.of.
       offset: converted with Timestamp.of, then taken modulo period.

     Raises:
       ValueError: size is not strictly positive.
     """
     if size <= 0:
         raise ValueError('The size parameter must be strictly positive.')
     self.size = Duration.of(size)
     self.period = Duration.of(period)
     offset_timestamp = Timestamp.of(offset)
     self.offset = offset_timestamp % period
Esempio n. 22
0
 def test_to_proto(self):
     """Timestamp.to_proto must turn 56 micros into 56000 nanos."""
     expected_ts_proto = timestamp_pb2.Timestamp(seconds=1234, nanos=56000)
     source_timestamp = Timestamp(seconds=1234, micros=56)
     self.assertEqual(Timestamp.to_proto(source_timestamp), expected_ts_proto)
Esempio n. 23
0
 def __init__(self, timestamp, element=None, window=None):
   """Stores the timestamp (via Timestamp.of), the element and the window.

   Args:
     timestamp: converted with Timestamp.of and stored as self.timestamp.
     element: stored unchanged; defaults to None.
     window: stored unchanged; defaults to None.
   """
   converted = Timestamp.of(timestamp)
   self.timestamp = converted
   self.element = element
   self.window = window
Esempio n. 24
0
 def test_of(self):
     """Timestamp.of keeps Timestamps, converts floats, rejects Durations."""
     original = Timestamp(123)
     # Passing a Timestamp must hand back the very same object.
     self.assertIs(original, Timestamp.of(original))
     from_float = Timestamp.of(123.0)
     self.assertEqual(original, from_float)
     with self.assertRaises(TypeError):
         Timestamp.of(Duration(10))
Esempio n. 25
0
 def __init__(self, size, period, offset=0):
   """Validates size, then stores size, period and the wrapped offset.

   Args:
     size: window size; converted with Duration.of.
     period: window period; converted with Duration.of.
     offset: converted with Timestamp.of, then taken modulo period.

   Raises:
     ValueError: size is not strictly positive.
   """
   if size <= 0:
     raise ValueError('The size parameter must be strictly positive.')
   size_duration = Duration.of(size)
   self.size = size_duration
   period_duration = Duration.of(period)
   self.period = period_duration
   # The modulo uses the period exactly as it was passed in.
   self.offset = Timestamp.of(offset) % period
Esempio n. 26
0
 def test_from_rfc3339_failure(self):
     """Malformed RFC 3339 text must raise a ValueError mentioning 'parse'."""
     malformed_inputs = (
         'not rfc3339',
         '2016-03-18T23:22:59.123456Z unparseable',
     )
     for text in malformed_inputs:
         with self.assertRaisesRegex(ValueError, 'parse'):
             Timestamp.from_rfc3339(text)
Esempio n. 27
0
 def __init__(self, new_watermark, tag=None):
   """Stores the new watermark as a Timestamp together with its tag.

   Args:
     new_watermark: converted with Timestamp.of.
     tag: stored unchanged; defaults to None.
   """
   watermark_timestamp = Timestamp.of(new_watermark)
   self.new_watermark = watermark_timestamp
   self.tag = tag
Esempio n. 28
0
 def test_from_utc_datetime(self):
     """Checks from_utc_datetime on a UTC-aware and on a naive datetime."""
     epoch_utc = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)
     self.assertEqual(Timestamp.from_utc_datetime(epoch_utc), Timestamp(0))

     # Without tzinfo the call must fail with a message mentioning UTC.
     naive_epoch = datetime.datetime(1970, 1, 1)
     with self.assertRaisesRegex(ValueError, r'UTC'):
         Timestamp.from_utc_datetime(naive_epoch)
Esempio n. 29
0
 def decode_from_stream(self, in_stream, nested):
   """Reads one big-endian int64 of microseconds and wraps it in a Timestamp.

   The nested argument is not used.
   """
   micros_value = in_stream.read_bigendian_int64()
   return Timestamp(micros=micros_value)
Esempio n. 30
0
    def test_arithmetic(self):
        """Checks Timestamp arithmetic with ints, Durations and Timestamps.

        Covers the values produced by the supported operators, equality
        between Timestamp and Duration, the class of each result, and the
        operand combinations that must raise TypeError.
        """
        # Supported operations.
        self.assertEqual(Timestamp(123) + 456, 579)
        self.assertEqual(Timestamp(123) + Duration(456), 579)
        self.assertEqual(456 + Timestamp(123), 579)
        self.assertEqual(Duration(456) + Timestamp(123), 579)
        self.assertEqual(Timestamp(123) - 456, -333)
        self.assertEqual(Timestamp(123) - Duration(456), -333)
        self.assertEqual(Timestamp(1230) % 456, 318)
        self.assertEqual(Timestamp(1230) % Duration(456), 318)
        self.assertEqual(Timestamp(123) - Timestamp(100), 23)

        # Check that direct comparison of Timestamp and Duration is allowed.
        self.assertTrue(Duration(123) == Timestamp(123))
        self.assertTrue(Timestamp(123) == Duration(123))
        self.assertFalse(Duration(123) == Timestamp(1230))
        self.assertFalse(Timestamp(123) == Duration(1230))

        # Check return types.
        # Addition and subtraction of an offset yield a Timestamp, while
        # modulo and Timestamp - Timestamp yield a Duration.
        self.assertEqual((Timestamp(123) + 456).__class__, Timestamp)
        self.assertEqual((Timestamp(123) + Duration(456)).__class__, Timestamp)
        self.assertEqual((456 + Timestamp(123)).__class__, Timestamp)
        self.assertEqual((Duration(456) + Timestamp(123)).__class__, Timestamp)
        self.assertEqual((Timestamp(123) - 456).__class__, Timestamp)
        self.assertEqual((Timestamp(123) - Duration(456)).__class__, Timestamp)
        self.assertEqual((Timestamp(1230) % 456).__class__, Duration)
        self.assertEqual((Timestamp(1230) % Duration(456)).__class__, Duration)
        self.assertEqual((Timestamp(123) - Timestamp(100)).__class__, Duration)

        # Unsupported operations.
        # Each expression is expected to raise TypeError while it is being
        # evaluated, so the surrounding assertEqual never gets to compare.
        with self.assertRaises(TypeError):
            self.assertEqual(Timestamp(123) * 456, 56088)
        with self.assertRaises(TypeError):
            self.assertEqual(Timestamp(123) * Duration(456), 56088)
        with self.assertRaises(TypeError):
            self.assertEqual(456 * Timestamp(123), 56088)
        with self.assertRaises(TypeError):
            self.assertEqual(Duration(456) * Timestamp(123), 56088)
        with self.assertRaises(TypeError):
            self.assertEqual(456 - Timestamp(123), 333)
        with self.assertRaises(TypeError):
            self.assertEqual(Duration(456) - Timestamp(123), 333)
        with self.assertRaises(TypeError):
            self.assertEqual(-Timestamp(123), -123)  # pylint: disable=invalid-unary-operand-type
        with self.assertRaises(TypeError):
            self.assertEqual(-Timestamp(123), -Duration(123))  # pylint: disable=invalid-unary-operand-type
        with self.assertRaises(TypeError):
            self.assertEqual(1230 % Timestamp(456), 318)
        with self.assertRaises(TypeError):
            self.assertEqual(Duration(1230) % Timestamp(456), 318)
Esempio n. 31
0
 def __init__(self, value, timestamp):
     # type: (Any, TimestampTypes) -> None
     """Pairs a value with a timestamp converted through Timestamp.of."""
     self.value = value
     converted = Timestamp.of(timestamp)
     self.timestamp = converted
Esempio n. 32
0
class RowCoderTest(unittest.TestCase):
  """Exercises RowCoder with the Person fixtures and small ad-hoc schemas."""

  # Fixture built with keyword arguments; it is also the first PEOPLE entry.
  JON_SNOW = Person(
      name="Jon Snow",
      age=np.int32(23),
      address=None,
      aliases=["crow", "wildling"],
      knows_javascript=False,
      payload=None,
      custom_metadata={},
      favorite_time=Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z'),
  )
  # Fixtures shared by the encode/decode tests below.
  PEOPLE = [
      JON_SNOW,
      Person(
          "Daenerys Targaryen",
          np.int32(25),
          "Westeros",
          ["Mother of Dragons"],
          False,
          None,
          {"dragons": 3},
          Timestamp.from_rfc3339('1970-04-26T17:46:40Z'),
      ),
      Person(
          "Michael Bluth",
          np.int32(30),
          None, [],
          True,
          b"I've made a huge mistake", {},
          Timestamp.from_rfc3339('2020-08-12T15:51:00.032Z'))
  ]

  def test_create_row_coder_from_named_tuple(self):
    """The registry's coder for Person agrees with one built from its schema."""
    expected_coder = RowCoder(typing_to_runner_api(Person).row_type.schema)
    real_coder = coders_registry.get_coder(Person)

    for person in self.PEOPLE:
      encoded_by_schema = expected_coder.encode(person)
      encoded_by_registry = real_coder.encode(person)
      self.assertEqual(encoded_by_schema, encoded_by_registry)

      round_tripped = real_coder.decode(real_coder.encode(person))
      self.assertEqual(person, round_tripped)

  def test_create_row_coder_from_schema(self):
    """A RowCoder built from a hand-written schema round-trips every Person."""

    def atomic(atomic_type, **options):
      # FieldType for an atomic type, with any extra FieldType options.
      return schema_pb2.FieldType(atomic_type=atomic_type, **options)

    def field(name, field_type):
      return schema_pb2.Field(name=name, type=field_type)

    # favorite_time is a micros_instant logical type represented as a row of
    # two INT64 fields.
    micros_instant_schema = schema_pb2.Schema(
        id="micros_instant",
        fields=[
            field("seconds", atomic(schema_pb2.INT64)),
            field("micros", atomic(schema_pb2.INT64)),
        ])
    favorite_time_type = schema_pb2.FieldType(
        logical_type=schema_pb2.LogicalType(
            urn="beam:logical_type:micros_instant:v1",
            representation=schema_pb2.FieldType(
                row_type=schema_pb2.RowType(schema=micros_instant_schema))))
    aliases_type = schema_pb2.FieldType(
        array_type=schema_pb2.ArrayType(
            element_type=atomic(schema_pb2.STRING)))
    custom_metadata_type = schema_pb2.FieldType(
        map_type=schema_pb2.MapType(
            key_type=atomic(schema_pb2.STRING),
            value_type=atomic(schema_pb2.INT64),
        ))

    schema = schema_pb2.Schema(
        id="person",
        fields=[
            field("name", atomic(schema_pb2.STRING)),
            field("age", atomic(schema_pb2.INT32)),
            field("address", atomic(schema_pb2.STRING, nullable=True)),
            field("aliases", aliases_type),
            field("knows_javascript", atomic(schema_pb2.BOOLEAN)),
            field("payload", atomic(schema_pb2.BYTES, nullable=True)),
            field("custom_metadata", custom_metadata_type),
            field("favorite_time", favorite_time_type),
        ])
    coder = RowCoder(schema)

    for person in self.PEOPLE:
      round_tripped = coder.decode(coder.encode(person))
      self.assertEqual(person, round_tripped)

  @unittest.skip(
      "BEAM-8030 - Overflow behavior in VarIntCoder is currently inconsistent")
  def test_overflows(self):
    """Boundary ints must encode; values one past the boundary must not."""
    IntTester = typing.NamedTuple(
        'IntTester',
        [
            # TODO(BEAM-7996): Test int8 and int16 here as well when those
            # types are supported
            # ('i8', typing.Optional[np.int8]),
            # ('i16', typing.Optional[np.int16]),
            ('i32', typing.Optional[np.int32]),
            ('i64', typing.Optional[np.int64]),
        ])

    coder = RowCoder.from_type_hint(IntTester, None)

    # Encode max/min ints to make sure they don't throw any error
    in_range_cases = chain(
        (IntTester(i32=i, i64=None) for i in (-2**31, 2**31 - 1)),
        (IntTester(i32=None, i64=i) for i in (-2**63, 2**63 - 1)),
    )
    for case in in_range_cases:
      coder.encode(case)

    # Encode max+1/min-1 ints to make sure they DO throw an error
    out_of_range_cases = chain(
        (IntTester(i32=i, i64=None) for i in (-2**31 - 1, 2**31)),
        (IntTester(i32=None, i64=i) for i in (-2**63 - 1, 2**63)),
    )
    for case in out_of_range_cases:
      with self.assertRaises(OverflowError):
        coder.encode(case)

  def test_none_in_non_nullable_field_throws(self):
    """Encoding None in a non-Optional field must raise ValueError."""
    Test = typing.NamedTuple('Test', [('foo', str)])

    coder = RowCoder.from_type_hint(Test, None)
    with self.assertRaises(ValueError):
      coder.encode(Test(foo=None))

  def test_schema_remove_column(self):
    """Data written with an extra trailing field decodes with the new type."""
    fields = [("field1", str), ("field2", str)]
    # new schema is missing one field that was in the old schema
    Old = typing.NamedTuple('Old', fields)
    New = typing.NamedTuple('New', fields[:-1])

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    encoded_old = old_coder.encode(Old("foo", "bar"))
    self.assertEqual(New("foo"), new_coder.decode(encoded_old))

  def test_schema_add_column(self):
    """Data written without a new optional field decodes with it as None."""
    fields = [("field1", str), ("field2", typing.Optional[str])]
    # new schema has one (optional) field that didn't exist in the old schema
    Old = typing.NamedTuple('Old', fields[:-1])
    New = typing.NamedTuple('New', fields)

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    encoded_old = old_coder.encode(Old("bar"))
    self.assertEqual(New("bar", None), new_coder.decode(encoded_old))

  def test_schema_add_column_with_null_value(self):
    """As test_schema_add_column, with a None value in the old data."""
    fields = [("field1", typing.Optional[str]), ("field2", str),
              ("field3", typing.Optional[str])]
    # new schema has one (optional) field that didn't exist in the old schema
    Old = typing.NamedTuple('Old', fields[:-1])
    New = typing.NamedTuple('New', fields)

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    encoded_old = old_coder.encode(Old(None, "baz"))
    self.assertEqual(New(None, "baz", None), new_coder.decode(encoded_old))

  def test_row_coder_picklable(self):
    """A pickled and unpickled Person coder equals the original coder."""
    # occasionally coders can get pickled, RowCoder should be able to handle it
    coder = coders_registry.get_coder(Person)
    pickled = pickler.dumps(coder)
    roundtripped = pickler.loads(pickled)

    self.assertEqual(roundtripped, coder)

  def test_row_coder_in_pipeine(self):
    """Person rows can be created and filtered inside a TestPipeline."""
    with TestPipeline() as p:
      people = p | beam.Create(self.PEOPLE)
      res = people | beam.Filter(lambda person: person.name == "Jon Snow")
      assert_that(res, equal_to([self.JON_SNOW]))

  def test_row_coder_nested_struct(self):
    """A NamedTuple whose fields are Person rows round-trips."""
    Pair = typing.NamedTuple('Pair', [('left', Person), ('right', Person)])

    value = Pair(self.PEOPLE[0], self.PEOPLE[1])
    coder = RowCoder(typing_to_runner_api(Pair).row_type.schema)

    decoded = coder.decode(coder.encode(value))
    self.assertEqual(value, decoded)

  def test_row_coder_fail_early_bad_schema(self):
    """Constructing a RowCoder from a field without type info must fail."""
    untyped_field = schema_pb2.Field(
        name="type_with_no_typeinfo", type=schema_pb2.FieldType())
    schema_proto = schema_pb2.Schema(fields=[untyped_field])

    # Should raise an exception referencing the problem field
    with self.assertRaisesRegex(ValueError, "type_with_no_typeinfo"):
      RowCoder(schema_proto)
Esempio n. 33
0
 def from_runner_api_parameter(fn_parameter, unused_context):
     # type: (...) -> SlidingWindows
     """Builds a SlidingWindows from a parameter's size, offset and period.

     Each of the three fields is read through its ToMicroseconds() method;
     unused_context is ignored.
     """
     window_size = Duration(micros=fn_parameter.size.ToMicroseconds())
     window_offset = Timestamp(micros=fn_parameter.offset.ToMicroseconds())
     window_period = Duration(micros=fn_parameter.period.ToMicroseconds())
     return SlidingWindows(
         size=window_size, offset=window_offset, period=window_period)
Esempio n. 34
0
    def test_wordcount(self):
        """Runs a word count on the InteractiveRunner and checks its results.

        The counts are verified three ways: as a plain set of (word, count)
        pairs, as the DataFrame returned by ib.collect with window info, and
        as the WindowedValues returned by result.get with window info.
        """
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                # Split the stripped line on whitespace into words.
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        # NOTE: locals() is captured here, so the local variables defined above
        # are part of this test's behaviour.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate the precision to millis because the window coder uses millis
        # as units then gets upcast to micros.
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True, n=10)
        # The expected frame reuses the order of `actual`, with every row at the
        # end of the global window in a single on-time pane.
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        # The same expectation, expressed as WindowedValue objects.
        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)
Esempio n. 35
0
 def __init__(self, timestamp, element=None, window=None):
     """Stores the timestamp (via Timestamp.of), the element and the window.

     Args:
       timestamp: converted with Timestamp.of and stored as self.timestamp.
       element: stored unchanged; defaults to None.
       window: stored unchanged; defaults to None.
     """
     timestamp_value = Timestamp.of(timestamp)
     self.timestamp = timestamp_value
     self.element = element
     self.window = window
class StandardCodersTest(unittest.TestCase):
  """Checks coders against the examples loaded from STANDARD_CODERS_YAML.

  When the class attribute `fix` is true, encoding mismatches are collected in
  `to_fix` instead of failing, and tearDownClass rewrites the YAML file with
  the actual encodings.
  """

  # Maps a coder URN to a function converting the JSON example value into the
  # Python value the coder works with. Parsers of composite coders also
  # receive the parsers of their component coders.
  _urn_to_json_value_parser = {
      'beam:coder:bytes:v1': lambda x: x.encode('utf-8'),
      'beam:coder:string_utf8:v1': lambda x: x,
      'beam:coder:varint:v1': lambda x: x,
      'beam:coder:kv:v1':
          lambda x, key_parser, value_parser: (key_parser(x['key']),
                                               value_parser(x['value'])),
      'beam:coder:interval_window:v1':
          lambda x: IntervalWindow(
              start=Timestamp(micros=(x['end'] - x['span']) * 1000),
              end=Timestamp(micros=x['end'] * 1000)),
      'beam:coder:iterable:v1': lambda x, parser: list(map(parser, x)),
      'beam:coder:global_window:v1': lambda x: window.GlobalWindow(),
      'beam:coder:windowed_value:v1':
          lambda x, value_parser, window_parser: windowed_value.create(
              value_parser(x['value']), x['timestamp'] * 1000,
              tuple([window_parser(w) for w in x['windows']])),
      'beam:coder:timer:v1':
          lambda x, payload_parser: dict(
              payload=payload_parser(x['payload']),
              timestamp=Timestamp(micros=x['timestamp'] * 1000)),
      'beam:coder:double:v1': parse_float,
  }

  def test_standard_coders(self):
    """Runs every test case loaded from STANDARD_CODERS_YAML."""
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    """Checks one spec's examples for each requested nesting mode.

    Args:
      name: name of the test case (not used in the checks).
      spec: mapping with the 'coder' description, the 'examples' mapping of
        expected encoding to JSON value, and optionally 'nested'.
    """
    def assert_equal(actual, expected):
      """Handle nan values which self.assertEqual fails on."""
      if (isinstance(actual, float)
          and isinstance(expected, float)
          and math.isnan(actual)
          and math.isnan(expected)):
        return
      self.assertEqual(actual, expected)

    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    # Without an explicit 'nested' entry both nesting modes are exercised.
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            # In fix mode a mismatch is recorded for tearDownClass rather
            # than reported as a failure.
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            decoded = decode_nested(coder, expected_encoded, nested)
            assert_equal(decoded, value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(decode_nested(coder, expected_encoded, nested),
                           value)

  def parse_coder(self, spec):
    """Builds a coder from its spec by registering a Coder proto.

    Component coders are parsed recursively and registered in a fresh
    PipelineContext; the coder for this spec is then looked up by its id.
    """
    context = pipeline_context.PipelineContext()
    coder_id = str(hash(str(spec)))
    component_ids = [context.coders.get_id(self.parse_coder(c))
                     for c in spec.get('components', ())]
    context.coders.put_proto(coder_id, beam_runner_api_pb2.Coder(
        spec=beam_runner_api_pb2.FunctionSpec(
            urn=spec['urn'], payload=spec.get('payload')),
        component_coder_ids=component_ids))
    return context.coders.get_by_id(coder_id)

  def json_value_parser(self, coder_spec):
    """Returns a one-argument parser for JSON values of the given coder spec.

    The parser registered for the spec's URN is called with the value
    followed by the parsers of the spec's components.
    """
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.

  fix = False
  # Shared at class level on purpose: (document index, expected encoding) ->
  # actual encoding, filled in by _run_standard_coder.
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    """In fix mode, rewrites the YAML file with the recorded encodings."""
    if cls.fix and cls.to_fix:
      print("FIXING", len(cls.to_fix), "TESTS")
      doc_sep = '\n---\n'
      # Context managers close the file even if reading or writing fails.
      with open(STANDARD_CODERS_YAML) as yaml_file:
        docs = yaml_file.read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')
      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print(quote(expected_encoded), "->", quote(actual_encoded))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
        yaml_file.write(doc_sep.join(docs))
Esempio n. 37
0
 def test_timestamps(self):
   """WindowedValue.timestamp equals Timestamp.of(...) and is a stable object."""
   int_stamped = windowed_value.WindowedValue(None, 3, ())
   self.assertEqual(int_stamped.timestamp, Timestamp.of(3))
   # Two reads of the attribute must hand back the same object.
   self.assertIs(int_stamped.timestamp, int_stamped.timestamp)
   float_stamped = windowed_value.WindowedValue(None, -2.5, ())
   self.assertEqual(float_stamped.timestamp, Timestamp.of(-2.5))
Esempio n. 38
0
 def test_from_rfc3339_failure(self):
   """Malformed RFC 3339 text must raise a ValueError mentioning 'parse'."""
   # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
   # which no longer exists in Python 3.12+.
   with self.assertRaisesRegex(ValueError, 'parse'):
     Timestamp.from_rfc3339('not rfc3339')
   # Trailing text after an otherwise valid timestamp must also be rejected.
   with self.assertRaisesRegex(ValueError, 'parse'):
     Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z unparseable')
Esempio n. 39
0
class StandardCodersTest(unittest.TestCase):
    """Runs the examples from STANDARD_CODERS_YAML against the coders.

    Each test case names a coder by URN (optionally with component coders)
    and maps expected encoded strings to JSON descriptions of the values.
    For every example the value is encoded and compared with the expected
    bytes, and the expected bytes are decoded and compared with the value.
    Coders flagged ``non_deterministic`` are only checked in the decoding
    direction.
    """

    # Maps a coder URN to a callable that builds the coder from its
    # (already built) component coders, passed positionally.
    _urn_to_coder_class = {
        'urn:beam:coders:bytes:0.1':
        coders.BytesCoder,
        'urn:beam:coders:varint:0.1':
        coders.VarIntCoder,
        'urn:beam:coders:kv:0.1':
        lambda k, v: coders.TupleCoder((k, v)),
        'urn:beam:coders:interval_window:0.1':
        coders.IntervalWindowCoder,
        'urn:beam:coders:stream:0.1':
        lambda t: coders.IterableCoder(t),
        'urn:beam:coders:global_window:0.1':
        coders.GlobalWindowCoder,
        'urn:beam:coders:windowed_value:0.1':
        lambda v, w: coders.WindowedValueCoder(v, w)
    }

    # Maps a coder URN to a callable turning the JSON form of a value into
    # the Python value; component parsers follow the JSON value positionally.
    _urn_to_json_value_parser = {
        'urn:beam:coders:bytes:0.1':
        lambda x: x,
        'urn:beam:coders:varint:0.1':
        lambda x: x,
        'urn:beam:coders:kv:0.1':
        lambda x, key_parser, value_parser:
        (key_parser(x['key']), value_parser(x['value'])),
        # 'end' and 'span' are scaled by 1000 to obtain micros (so the JSON
        # fields are presumably milliseconds); start is derived as end - span.
        'urn:beam:coders:interval_window:0.1':
        lambda x: IntervalWindow(start=Timestamp(micros=(x['end'] - x['span'])
                                                 * 1000),
                                 end=Timestamp(micros=x['end'] * 1000)),
        # Materialize the result: on Python 3 a bare map() is a lazy iterator
        # that never compares equal to a decoded value.
        'urn:beam:coders:stream:0.1':
        lambda x, parser: list(map(parser, x)),
        'urn:beam:coders:global_window:0.1':
        lambda x: window.GlobalWindow(),
        'urn:beam:coders:windowed_value:0.1':
        lambda x, value_parser, window_parser: windowed_value.create(
            value_parser(x['value']), x['timestamp'] * 1000,
            tuple([window_parser(w) for w in x['windows']]))
    }

    def test_standard_coders(self):
        """Run every test case loaded from STANDARD_CODERS_YAML."""
        for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
            logging.info('Executing %s test.', name)
            self._run_standard_coder(name, spec)

    def _run_standard_coder(self, name, spec):
        """Check all examples of one test case.

        Args:
          name: name of the test case (not used here; the caller logs it).
          spec: test case mapping with a 'coder' spec, an 'examples' mapping
            of expected encoded string -> JSON value, an optional 'nested'
            flag and, when fix mode is on, an 'index' entry.

        Without an explicit 'nested' flag both nested and unnested contexts
        are exercised. In fix mode an encoding mismatch is recorded in
        ``to_fix`` instead of failing the test.
        """
        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        nested_list = [spec['nested']] if 'nested' in spec else [True, False]
        deterministic = not spec['coder'].get('non_deterministic', False)
        for nested in nested_list:
            for expected_encoded, json_value in spec['examples'].items():
                value = parse_value(json_value)
                expected_encoded = expected_encoded.encode('latin1')
                if deterministic:
                    actual_encoded = encode_nested(coder, value, nested)
                    if self.fix and actual_encoded != expected_encoded:
                        self.to_fix[spec['index'],
                                    expected_encoded] = actual_encoded
                    else:
                        self.assertEqual(expected_encoded, actual_encoded)
                        self.assertEqual(
                            decode_nested(coder, expected_encoded, nested),
                            value)
                else:
                    # Only verify decoding for a non-deterministic coder
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested), value)

    def parse_coder(self, spec):
        """Build the coder for ``spec['urn']`` from its parsed components."""
        return self._urn_to_coder_class[spec['urn']](
            *[self.parse_coder(c) for c in spec.get('components', ())])

    def json_value_parser(self, coder_spec):
        """Return a callable that parses a JSON value for ``coder_spec``.

        Component parsers are built recursively; the parser registered for
        the spec's URN is looked up when the returned callable is invoked.
        """
        component_parsers = [
            self.json_value_parser(c)
            for c in coder_spec.get('components', ())
        ]
        return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
            x, *component_parsers)

    # Used when --fix is passed.

    # When true, encoding mismatches are collected in `to_fix` and written
    # back to STANDARD_CODERS_YAML by tearDownClass instead of failing.
    fix = False
    # Maps (spec['index'], expected_encoded) to actual_encoded.
    to_fix = {}

    @classmethod
    def tearDownClass(cls):
        """Rewrite outdated expected encodings in STANDARD_CODERS_YAML.

        Does nothing unless ``cls.fix`` is set and ``cls.to_fix`` is
        non-empty. Otherwise each recorded expected key is replaced by the
        actual encoding in its YAML document and the file is written back.
        """
        if not (cls.fix and cls.to_fix):
            return
        # A single pre-formatted argument prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print("FIXING %d TESTS" % len(cls.to_fix))
        doc_sep = '\n---\n'
        # Context managers close the handles (and flush the rewritten
        # content) instead of relying on garbage collection.
        with open(STANDARD_CODERS_YAML) as yaml_file:
            docs = yaml_file.read().split(doc_sep)

        def quote(s):
            # JSON-quote the latin1-decoded bytes, spelling NUL as \0
            # rather than \u0000.
            return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

        for (doc_ix,
             expected_encoded), actual_encoded in cls.to_fix.items():
            print("%s -> %s" % (quote(expected_encoded),
                                quote(actual_encoded)))
            # The trailing ':' restricts the replacement to mapping keys.
            docs[doc_ix] = docs[doc_ix].replace(
                quote(expected_encoded) + ':',
                quote(actual_encoded) + ':')
        with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
            yaml_file.write(doc_sep.join(docs))
Esempio n. 40
0
 def start(self):
     # type: () -> Timestamp
     """Return the start Timestamp, built from ``_start_micros`` on first use."""
     cached = self._start_object
     if cached is None:
         # Build once and remember it so later calls return the same object.
         cached = self._start_object = Timestamp(0, self._start_micros)
     return cached
Esempio n. 41
0
 def __init__(self, end):
   """Store ``end`` on the instance, converted with ``Timestamp.of``."""
   end_timestamp = Timestamp.of(end)
   self.end = end_timestamp
Esempio n. 42
0
 def end(self):
     # type: () -> Timestamp
     """Return the end Timestamp, built from ``_end_micros`` on first use."""
     cached = self._end_object
     if cached is None:
         # Build once and remember it so later calls return the same object.
         cached = self._end_object = Timestamp(0, self._end_micros)
     return cached
Esempio n. 43
0
 def __init__(self, value, timestamp):
   """Store ``value`` as given and ``timestamp`` converted by ``Timestamp.of``."""
   self.value = value
   converted = Timestamp.of(timestamp)
   self.timestamp = converted
 def advance_watermark(self, watermark_secs):
     """Append a watermark-advance record and return ``self`` for chaining.

     ``watermark_secs`` is converted with ``Timestamp.of`` and its ``micros``
     value becomes the event's ``new_watermark``.
     """
     watermark_micros = Timestamp.of(watermark_secs).micros
     advance = TestStreamPayload.Event.AdvanceWatermark(
         new_watermark=watermark_micros)
     event = TestStreamPayload.Event(watermark_event=advance)
     self._records.append(TestStreamFileRecord(recorded_event=event))
     return self
Esempio n. 45
0
 def timestamp(self):
     # type: () -> Timestamp
     """Return the Timestamp, built from ``timestamp_micros`` on first use."""
     cached = self.timestamp_object
     if cached is None:
         # Build once and remember it so later calls return the same object.
         cached = self.timestamp_object = Timestamp(0, self.timestamp_micros)
     return cached
Esempio n. 46
0
 def __init__(self, start, end):
     """Initialize the base class with ``end`` and store ``start``.

     ``start`` is converted with ``Timestamp.of`` before being stored.
     """
     super(IntervalWindow, self).__init__(end)
     self.start = Timestamp.of(start)