Exemple #1
0
 def test_from_rfc3339(self):
   """Check RFC 3339 parsing against known epoch values, plus round-trip."""
   # Each pair is (seconds since the epoch, equivalent RFC 3339 UTC string).
   known_pairs = (
       (10000000, '1970-04-26T17:46:40Z'),
       (10000000.000001, '1970-04-26T17:46:40.000001Z'),
       (1458343379.123456, '2016-03-18T23:22:59.123456Z'),
   )
   for epoch_seconds, text in known_pairs:
     expected = Timestamp(epoch_seconds)
     parsed = Timestamp.from_rfc3339(text)
     self.assertEqual(expected, parsed)
     # Formatting a freshly parsed value must reproduce the input exactly.
     reformatted = Timestamp.from_rfc3339(text).to_rfc3339()
     self.assertEqual(text, reformatted)
Exemple #2
0
 def test_from_rfc3339(self):
     """Verify that RFC 3339 strings parse to the expected instants.

     Also verifies that formatting the parsed value gives back the original
     string.
     """
     # (seconds since the epoch, RFC 3339 representation in UTC)
     samples = (
         (10000000, '1970-04-26T17:46:40Z'),
         (10000000.000001, '1970-04-26T17:46:40.000001Z'),
         (1458343379.123456, '2016-03-18T23:22:59.123456Z'),
     )
     for seconds, rfc3339_text in samples:
         from_seconds = Timestamp(seconds)
         from_text = Timestamp.from_rfc3339(rfc3339_text)
         self.assertEqual(from_seconds, from_text)

         round_tripped = Timestamp.from_rfc3339(rfc3339_text).to_rfc3339()
         self.assertEqual(rfc3339_text, round_tripped)
      def _get_element(message):
        """Pair a parsed Pub/Sub message with the timestamp to use for it.

        When ``timestamp_attribute`` (taken from the enclosing scope) is set
        and present in the message attributes, its value is parsed first as
        an RFC 3339 string and, failing that, as an integer number of
        milliseconds. Otherwise ``message.service_timestamp`` is parsed as
        RFC 3339.

        Returns:
          A ``(timestamp, parsed_message)`` tuple.

        Raises:
          ValueError: if the attribute value is neither RFC 3339 nor an
            integer.
        """
        parsed_message = PubsubMessage._from_message(message)
        if (not timestamp_attribute or
            timestamp_attribute not in parsed_message.attributes):
          # No usable attribute: fall back to the service-provided timestamp.
          return (
              Timestamp.from_rfc3339(message.service_timestamp),
              parsed_message)

        raw_value = parsed_message.attributes[timestamp_attribute]
        try:
          event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError:
          try:
            # Milliseconds are scaled to the microseconds Timestamp expects.
            event_time = Timestamp(micros=int(raw_value) * 1000)
          except ValueError as err:
            raise ValueError('Bad timestamp value: %s' % err)
        return event_time, parsed_message
Exemple #4
0
 def test_from_rfc3339_with_timezone(self):
     """Check that RFC 3339 strings with a UTC offset parse correctly."""
     # The same wall-clock time at +04:00 and -04:00; the expected epoch
     # seconds are therefore eight hours (28800 s) apart.
     offset_cases = (
         (1458328979.123456, '2016-03-18T23:22:59.123456+04:00'),
         (1458357779.123456, '2016-03-18T23:22:59.123456-04:00'),
     )
     for epoch_seconds, text in offset_cases:
         expected = Timestamp(epoch_seconds)
         parsed = Timestamp.from_rfc3339(text)
         self.assertEqual(expected, parsed)
      def _get_element(message):
        """Pair a parsed Pub/Sub message with the timestamp to use for it.

        When ``timestamp_attribute`` (taken from the enclosing scope) is set,
        the attribute must exist on the message. Its value is parsed first as
        an RFC 3339 string and, failing that, as an integer number of
        milliseconds. When it is not set, ``message.service_timestamp`` is
        parsed as RFC 3339.

        Returns:
          A ``(timestamp, parsed_message)`` tuple.

        Raises:
          KeyError: if the configured attribute is missing from the message.
          ValueError: if the attribute value is neither RFC 3339 nor an
            integer.
        """
        parsed_message = PubsubMessage._from_message(message)
        if not timestamp_attribute:
          # Nothing configured: use the service-provided timestamp.
          return (
              Timestamp.from_rfc3339(message.service_timestamp),
              parsed_message)

        try:
          raw_value = parsed_message.attributes[timestamp_attribute]
        except KeyError as err:
          raise KeyError('Timestamp attribute not found: %s' % err)

        try:
          event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError:
          try:
            # Milliseconds are scaled to the microseconds Timestamp expects.
            event_time = Timestamp(micros=int(raw_value) * 1000)
          except ValueError as err:
            raise ValueError('Bad timestamp value: %s' % err)
        return event_time, parsed_message
    def _get_element(message):
      """Pair a parsed Pub/Sub message with the timestamp to use for it.

      When ``timestamp_attribute`` (taken from the enclosing scope) is set and
      present in the message attributes, its value is parsed first as an
      RFC 3339 string and, failing that, as an integer number of
      milliseconds. Otherwise the timestamp is built from the ``seconds`` and
      ``nanos`` fields of ``message.publish_time``.

      Returns:
        A ``(timestamp, parsed_message)`` tuple.

      Raises:
        ValueError: if the attribute value is neither RFC 3339 nor an integer.
      """
      parsed_message = PubsubMessage._from_message(message)
      if (not timestamp_attribute or
          timestamp_attribute not in parsed_message.attributes):
        # Fall back to the publish time; nanoseconds are floored to micros.
        publish_seconds = message.publish_time.seconds
        publish_micros = message.publish_time.nanos // 1000
        return Timestamp(publish_seconds, publish_micros), parsed_message

      raw_value = parsed_message.attributes[timestamp_attribute]
      try:
        event_time = Timestamp.from_rfc3339(raw_value)
      except ValueError:
        try:
          # Milliseconds are scaled to the microseconds Timestamp expects.
          event_time = Timestamp(micros=int(raw_value) * 1000)
        except ValueError as err:
          raise ValueError('Bad timestamp value: %s' % err)
      return event_time, parsed_message
Exemple #7
0
    def _get_element(message):
      """Pair a parsed Pub/Sub message with the timestamp to use for it.

      When ``timestamp_attribute`` (taken from the enclosing scope) is set and
      present in the message attributes, its value is interpreted first as an
      integer number of milliseconds and, failing that, as an RFC 3339
      string. Otherwise ``message.publish_time`` is converted with
      ``Timestamp.from_utc_datetime``.

      Returns:
        A ``(timestamp, parsed_message)`` tuple.

      Raises:
        ValueError: if the attribute value is neither an integer nor
          RFC 3339, or if the fallback is needed and ``publish_time`` is
          ``None``.
      """
      parsed_message = PubsubMessage._from_message(message)
      if (not timestamp_attribute or
          timestamp_attribute not in parsed_message.attributes):
        if message.publish_time is None:
          raise ValueError('No publish time present in message: %s' % message)
        return (
            Timestamp.from_utc_datetime(message.publish_time), parsed_message)

      raw_value = parsed_message.attributes[timestamp_attribute]
      try:
        # Milliseconds are scaled to the microseconds Timestamp expects.
        event_time = Timestamp(micros=int(raw_value) * 1000)
      except ValueError:
        try:
          event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError as err:
          raise ValueError('Bad timestamp value: %s' % err)
      return event_time, parsed_message
Exemple #8
0
 def test_from_rfc3339_failure(self):
     """Malformed RFC 3339 input must raise ValueError matching 'parse'."""
     malformed_inputs = (
         'not rfc3339',
         # A valid timestamp followed by trailing text is rejected as well.
         '2016-03-18T23:22:59.123456Z unparseable',
     )
     for bad_text in malformed_inputs:
         with self.assertRaisesRegex(ValueError, 'parse'):
             Timestamp.from_rfc3339(bad_text)
Exemple #9
0
class RowCoderTest(unittest.TestCase):
    """Round-trip and schema-evolution tests for ``RowCoder``.

    Most tests encode and decode the ``Person`` fixtures defined below.
    """
    # Fixture built with keyword arguments; the keywords show the Person
    # field names that the positional fixtures in PEOPLE follow.
    JON_SNOW = Person(
        name="Jon Snow",
        age=np.int32(23),
        address=None,
        aliases=["crow", "wildling"],
        knows_javascript=False,
        payload=None,
        custom_metadata={},
        favorite_time=Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z'),
    )
    # Sample rows covering null and non-null values for address and payload,
    # empty and non-empty lists/maps, and both boolean values.
    PEOPLE = [
        JON_SNOW,
        Person(
            "Daenerys Targaryen",
            np.int32(25),
            "Westeros",
            ["Mother of Dragons"],
            False,
            None,
            {"dragons": 3},
            Timestamp.from_rfc3339('1970-04-26T17:46:40Z'),
        ),
        Person("Michael Bluth", np.int32(30), None, [], True,
               b"I've made a huge mistake", {},
               Timestamp.from_rfc3339('2020-08-12T15:51:00.032Z'))
    ]

    def test_create_row_coder_from_named_tuple(self):
        """The registered coder for Person must match an explicit RowCoder.

        For every fixture, the coder looked up in the registry must produce
        the same bytes as a RowCoder built from the schema derived from
        Person, and decoding its own output must return the original value.
        """
        expected_coder = RowCoder(typing_to_runner_api(Person).row_type.schema)
        real_coder = coders_registry.get_coder(Person)

        for test_case in self.PEOPLE:
            self.assertEqual(expected_coder.encode(test_case),
                             real_coder.encode(test_case))

            self.assertEqual(test_case,
                             real_coder.decode(real_coder.encode(test_case)))

    def test_create_row_coder_from_schema(self):
        """A RowCoder built from a hand-written schema round-trips PEOPLE."""
        # The fields are listed in the same order as the Person fixtures'
        # positional arguments above.
        schema = schema_pb2.Schema(
            id="person",
            fields=[
                schema_pb2.Field(
                    name="name",
                    type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING)),
                schema_pb2.Field(
                    name="age",
                    type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
                schema_pb2.Field(name="address",
                                 type=schema_pb2.FieldType(
                                     atomic_type=schema_pb2.STRING,
                                     nullable=True)),
                schema_pb2.Field(
                    name="aliases",
                    type=schema_pb2.FieldType(array_type=schema_pb2.ArrayType(
                        element_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING)))),
                schema_pb2.Field(
                    name="knows_javascript",
                    type=schema_pb2.FieldType(atomic_type=schema_pb2.BOOLEAN)),
                schema_pb2.Field(name="payload",
                                 type=schema_pb2.FieldType(
                                     atomic_type=schema_pb2.BYTES,
                                     nullable=True)),
                schema_pb2.Field(
                    name="custom_metadata",
                    type=schema_pb2.FieldType(map_type=schema_pb2.MapType(
                        key_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING),
                        value_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.INT64),
                    ))),
                # favorite_time is a logical type whose representation is a
                # row with INT64 "seconds" and "micros" fields.
                schema_pb2.Field(
                    name="favorite_time",
                    type=schema_pb2.
                    FieldType(logical_type=schema_pb2.LogicalType(
                        urn="beam:logical_type:micros_instant:v1",
                        representation=schema_pb2.FieldType(
                            row_type=schema_pb2.RowType(
                                schema=schema_pb2.Schema(
                                    id="micros_instant",
                                    fields=[
                                        schema_pb2.Field(
                                            name="seconds",
                                            type=schema_pb2.FieldType(
                                                atomic_type=schema_pb2.INT64)),
                                        schema_pb2.Field(
                                            name="micros",
                                            type=schema_pb2.FieldType(
                                                atomic_type=schema_pb2.INT64)),
                                    ])))))),
            ])
        coder = RowCoder(schema)

        for test_case in self.PEOPLE:
            self.assertEqual(test_case, coder.decode(coder.encode(test_case)))

    @unittest.skip(
        "BEAM-8030 - Overflow behavior in VarIntCoder is currently inconsistent"
    )
    def test_overflows(self):
        """Boundary ints must encode; one past the boundary must overflow.

        Currently skipped (see the decorator's reason).
        """
        IntTester = typing.NamedTuple(
            'IntTester',
            [
                # TODO(BEAM-7996): Test int8 and int16 here as well when those
                # types are supported
                # ('i8', typing.Optional[np.int8]),
                # ('i16', typing.Optional[np.int16]),
                ('i32', typing.Optional[np.int32]),
                ('i64', typing.Optional[np.int64]),
            ])

        c = RowCoder.from_type_hint(IntTester, None)

        # Minimum and maximum values of the signed 32- and 64-bit ranges.
        no_overflow = chain(
            (IntTester(i32=i, i64=None) for i in (-2**31, 2**31 - 1)),
            (IntTester(i32=None, i64=i) for i in (-2**63, 2**63 - 1)),
        )

        # Encode max/min ints to make sure they don't throw any error
        for case in no_overflow:
            c.encode(case)

        # One below the minimum and one above the maximum of each range.
        overflow = chain(
            (IntTester(i32=i, i64=None) for i in (-2**31 - 1, 2**31)),
            (IntTester(i32=None, i64=i) for i in (-2**63 - 1, 2**63)),
        )

        # Encode max+1/min-1 ints to make sure they DO throw an error
        for case in overflow:
            self.assertRaises(OverflowError, lambda: c.encode(case))

    def test_none_in_non_nullable_field_throws(self):
        """Encoding None in a non-Optional field must raise ValueError."""
        Test = typing.NamedTuple('Test', [('foo', unicode)])

        c = RowCoder.from_type_hint(Test, None)
        self.assertRaises(ValueError, lambda: c.encode(Test(foo=None)))

    def test_schema_remove_column(self):
        """Data written with two fields decodes with a one-field schema."""
        fields = [("field1", unicode), ("field2", unicode)]
        # new schema is missing one field that was in the old schema
        Old = typing.NamedTuple('Old', fields)
        New = typing.NamedTuple('New', fields[:-1])

        old_coder = RowCoder.from_type_hint(Old, None)
        new_coder = RowCoder.from_type_hint(New, None)

        self.assertEqual(New("foo"),
                         new_coder.decode(old_coder.encode(Old("foo", "bar"))))

    def test_schema_add_column(self):
        """An Optional field absent from the encoded data decodes as None."""
        fields = [("field1", unicode), ("field2", typing.Optional[unicode])]
        # new schema has one (optional) field that didn't exist in the old schema
        Old = typing.NamedTuple('Old', fields[:-1])
        New = typing.NamedTuple('New', fields)

        old_coder = RowCoder.from_type_hint(Old, None)
        new_coder = RowCoder.from_type_hint(New, None)

        self.assertEqual(New("bar", None),
                         new_coder.decode(old_coder.encode(Old("bar"))))

    def test_schema_add_column_with_null_value(self):
        """Adding an Optional field works when the old data contains a null."""
        fields = [("field1", typing.Optional[unicode]), ("field2", unicode),
                  ("field3", typing.Optional[unicode])]
        # new schema has one (optional) field that didn't exist in the old schema
        Old = typing.NamedTuple('Old', fields[:-1])
        New = typing.NamedTuple('New', fields)

        old_coder = RowCoder.from_type_hint(Old, None)
        new_coder = RowCoder.from_type_hint(New, None)

        self.assertEqual(New(None, "baz", None),
                         new_coder.decode(old_coder.encode(Old(None, "baz"))))

    def test_row_coder_picklable(self):
        """A coder pickled and unpickled must compare equal to the original."""
        # occasionally coders can get pickled, RowCoder should be able to handle it
        coder = coders_registry.get_coder(Person)
        roundtripped = pickler.loads(pickler.dumps(coder))

        self.assertEqual(roundtripped, coder)

    def test_row_coder_in_pipeine(self):
        """Person rows can flow through a pipeline and be filtered by name."""
        with TestPipeline() as p:
            res = (p
                   | beam.Create(self.PEOPLE)
                   | beam.Filter(lambda person: person.name == "Jon Snow"))
            assert_that(res, equal_to([self.JON_SNOW]))

    def test_row_coder_nested_struct(self):
        """A row whose fields are themselves Person rows must round-trip."""
        Pair = typing.NamedTuple('Pair', [('left', Person), ('right', Person)])

        value = Pair(self.PEOPLE[0], self.PEOPLE[1])
        coder = RowCoder(typing_to_runner_api(Pair).row_type.schema)

        self.assertEqual(value, coder.decode(coder.encode(value)))
Exemple #10
0
class RowCoderTest(unittest.TestCase):
  """Round-trip, schema-evolution and encoding-position tests for RowCoder.

  Most tests encode and decode the ``Person`` fixtures defined below.
  """
  # Fixture built with keyword arguments; the keywords show the Person field
  # names that the positional fixtures in PEOPLE follow.
  JON_SNOW = Person(
      name="Jon Snow",
      age=np.int32(23),
      address=None,
      aliases=["crow", "wildling"],
      knows_javascript=False,
      payload=None,
      custom_metadata={},
      favorite_time=Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z'),
  )
  # Sample rows covering null and non-null values for address and payload,
  # empty and non-empty lists/maps, and both boolean values.
  PEOPLE = [
      JON_SNOW,
      Person(
          "Daenerys Targaryen",
          np.int32(25),
          "Westeros",
          ["Mother of Dragons"],
          False,
          None,
          {"dragons": 3},
          Timestamp.from_rfc3339('1970-04-26T17:46:40Z'),
      ),
      Person(
          "Michael Bluth",
          np.int32(30),
          None, [],
          True,
          b"I've made a huge mistake", {},
          Timestamp.from_rfc3339('2020-08-12T15:51:00.032Z'))
  ]

  def test_row_accepts_trailing_zeros_truncated(self):
    """A payload with a shortened null-bitmap header must still decode.

    The first four bytes of the encoded row are replaced by a three-byte
    header (see the inline comment) and the result must decode to the same
    value.
    """
    expected_coder = RowCoder(
        typing_to_runner_api(NullablePerson).row_type.schema)
    person = NullablePerson(
        None,
        np.int32(25),
        "Westeros", ["Mother of Dragons"],
        False,
        None, {"dragons": 3},
        None,
        "NotNull")
    out = expected_coder.encode(person)
    # NOTE(review): presumably the original 4-byte header carries a second,
    # all-zero bitmap byte that is dropped here - confirm against the coder.
    # 9 fields, 1 null byte, field 0, 5, 7 are null
    new_payload = bytes([9, 1, 1 | 1 << 5 | 1 << 7]) + out[4:]
    new_value = expected_coder.decode(new_payload)
    self.assertEqual(person, new_value)

  def test_create_row_coder_from_named_tuple(self):
    """The registered coder for Person must match an explicit RowCoder.

    For every fixture, the coder looked up in the registry must produce the
    same bytes as a RowCoder built from the schema derived from Person, and
    decoding its own output must return the original value.
    """
    expected_coder = RowCoder(typing_to_runner_api(Person).row_type.schema)
    real_coder = coders_registry.get_coder(Person)

    for test_case in self.PEOPLE:
      self.assertEqual(
          expected_coder.encode(test_case), real_coder.encode(test_case))

      self.assertEqual(
          test_case, real_coder.decode(real_coder.encode(test_case)))

  def test_create_row_coder_from_schema(self):
    """A RowCoder built from a hand-written schema round-trips PEOPLE."""
    # The fields are listed in the same order as the Person fixtures'
    # positional arguments above.
    schema = schema_pb2.Schema(
        id="person",
        fields=[
            schema_pb2.Field(
                name="name",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING)),
            schema_pb2.Field(
                name="age",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
            schema_pb2.Field(
                name="address",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True)),
            schema_pb2.Field(
                name="aliases",
                type=schema_pb2.FieldType(
                    array_type=schema_pb2.ArrayType(
                        element_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING)))),
            schema_pb2.Field(
                name="knows_javascript",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.BOOLEAN)),
            schema_pb2.Field(
                name="payload",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.BYTES, nullable=True)),
            schema_pb2.Field(
                name="custom_metadata",
                type=schema_pb2.FieldType(
                    map_type=schema_pb2.MapType(
                        key_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING),
                        value_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.INT64),
                    ))),
            # favorite_time is a logical type whose representation is a row
            # with INT64 "seconds" and "micros" fields.
            schema_pb2.Field(
                name="favorite_time",
                type=schema_pb2.FieldType(
                    logical_type=schema_pb2.LogicalType(
                        urn="beam:logical_type:micros_instant:v1",
                        representation=schema_pb2.FieldType(
                            row_type=schema_pb2.RowType(
                                schema=schema_pb2.Schema(
                                    id="micros_instant",
                                    fields=[
                                        schema_pb2.Field(
                                            name="seconds",
                                            type=schema_pb2.FieldType(
                                                atomic_type=schema_pb2.INT64)),
                                        schema_pb2.Field(
                                            name="micros",
                                            type=schema_pb2.FieldType(
                                                atomic_type=schema_pb2.INT64)),
                                    ])))))),
        ])
    coder = RowCoder(schema)

    for test_case in self.PEOPLE:
      self.assertEqual(test_case, coder.decode(coder.encode(test_case)))

  @unittest.skip(
      "BEAM-8030 - Overflow behavior in VarIntCoder is currently inconsistent")
  def test_overflows(self):
    """Boundary ints must encode; one past the boundary must overflow.

    Currently skipped (see the decorator's reason).
    """
    IntTester = typing.NamedTuple(
        'IntTester',
        [
            # TODO(BEAM-7996): Test int8 and int16 here as well when those
            # types are supported
            # ('i8', typing.Optional[np.int8]),
            # ('i16', typing.Optional[np.int16]),
            ('i32', typing.Optional[np.int32]),
            ('i64', typing.Optional[np.int64]),
        ])

    c = RowCoder.from_type_hint(IntTester, None)

    # Minimum and maximum values of the signed 32- and 64-bit ranges.
    no_overflow = chain(
        (IntTester(i32=i, i64=None) for i in (-2**31, 2**31 - 1)),
        (IntTester(i32=None, i64=i) for i in (-2**63, 2**63 - 1)),
    )

    # Encode max/min ints to make sure they don't throw any error
    for case in no_overflow:
      c.encode(case)

    # One below the minimum and one above the maximum of each range.
    overflow = chain(
        (IntTester(i32=i, i64=None) for i in (-2**31 - 1, 2**31)),
        (IntTester(i32=None, i64=i) for i in (-2**63 - 1, 2**63)),
    )

    # Encode max+1/min-1 ints to make sure they DO throw an error
    # pylint: disable=cell-var-from-loop
    for case in overflow:
      self.assertRaises(OverflowError, lambda: c.encode(case))

  def test_none_in_non_nullable_field_throws(self):
    """Encoding None in a non-Optional field must raise ValueError."""
    Test = typing.NamedTuple('Test', [('foo', str)])

    c = RowCoder.from_type_hint(Test, None)
    self.assertRaises(ValueError, lambda: c.encode(Test(foo=None)))

  def test_schema_remove_column(self):
    """Data written with two fields decodes with a one-field schema."""
    fields = [("field1", str), ("field2", str)]
    # new schema is missing one field that was in the old schema
    Old = typing.NamedTuple('Old', fields)
    New = typing.NamedTuple('New', fields[:-1])

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    self.assertEqual(
        New("foo"), new_coder.decode(old_coder.encode(Old("foo", "bar"))))

  def test_schema_add_column(self):
    """An Optional field absent from the encoded data decodes as None."""
    fields = [("field1", str), ("field2", typing.Optional[str])]
    # new schema has one (optional) field that didn't exist in the old schema
    Old = typing.NamedTuple('Old', fields[:-1])
    New = typing.NamedTuple('New', fields)

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    self.assertEqual(
        New("bar", None), new_coder.decode(old_coder.encode(Old("bar"))))

  def test_schema_add_column_with_null_value(self):
    """Adding an Optional field works when the old data contains a null."""
    fields = [("field1", typing.Optional[str]), ("field2", str),
              ("field3", typing.Optional[str])]
    # new schema has one (optional) field that didn't exist in the old schema
    Old = typing.NamedTuple('Old', fields[:-1])
    New = typing.NamedTuple('New', fields)

    old_coder = RowCoder.from_type_hint(Old, None)
    new_coder = RowCoder.from_type_hint(New, None)

    self.assertEqual(
        New(None, "baz", None),
        new_coder.decode(old_coder.encode(Old(None, "baz"))))

  def test_row_coder_picklable(self):
    """A coder pickled and unpickled must compare equal to the original."""
    # occasionally coders can get pickled, RowCoder should be able to handle it
    coder = coders_registry.get_coder(Person)
    roundtripped = pickler.loads(pickler.dumps(coder))

    self.assertEqual(roundtripped, coder)

  def test_row_coder_in_pipeine(self):
    """Person rows can flow through a pipeline and be filtered by name."""
    with TestPipeline() as p:
      res = (
          p
          | beam.Create(self.PEOPLE)
          | beam.Filter(lambda person: person.name == "Jon Snow"))
      assert_that(res, equal_to([self.JON_SNOW]))

  def test_row_coder_nested_struct(self):
    """A row whose fields are themselves Person rows must round-trip."""
    Pair = typing.NamedTuple('Pair', [('left', Person), ('right', Person)])

    value = Pair(self.PEOPLE[0], self.PEOPLE[1])
    coder = RowCoder(typing_to_runner_api(Pair).row_type.schema)

    self.assertEqual(value, coder.decode(coder.encode(value)))

  def test_encoding_position_reorder_fields(self):
    """Decoding honours encoding positions when the field order differs.

    schema2 lists the fields in the opposite order from schema1 but assigns
    encoding positions matching schema1's order, so data encoded with
    schema1 must decode to the same field values under schema2.
    """
    schema1 = schema_pb2.Schema(
        id="reorder_test_schema1",
        fields=[
            schema_pb2.Field(
                name="f_int32",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
            ),
            schema_pb2.Field(
                name="f_str",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
            ),
        ])
    schema2 = schema_pb2.Schema(
        id="reorder_test_schema2",
        encoding_positions_set=True,
        fields=[
            schema_pb2.Field(
                name="f_str",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
                encoding_position=1,
            ),
            schema_pb2.Field(
                name="f_int32",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
                encoding_position=0,
            ),
        ])

    RowSchema1 = named_tuple_from_schema(schema1)
    RowSchema2 = named_tuple_from_schema(schema2)
    roundtripped = RowCoder(schema2).decode(
        RowCoder(schema1).encode(RowSchema1(42, "Hello World!")))

    self.assertEqual(RowSchema2(f_int32=42, f_str="Hello World!"), roundtripped)

  def test_encoding_position_add_fields_and_reorder(self):
    """A new nullable field plus reordering still decodes old data.

    The new schema puts the added nullable field first in declaration order
    but last in encoding position; data encoded with the old schema must
    decode with that field set to None.
    """
    old_schema = schema_pb2.Schema(
        id="add_test_old",
        fields=[
            schema_pb2.Field(
                name="f_int32",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
            ),
            schema_pb2.Field(
                name="f_str",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
            ),
        ])
    new_schema = schema_pb2.Schema(
        encoding_positions_set=True,
        id="add_test_new",
        fields=[
            schema_pb2.Field(
                name="f_new_str",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True),
                encoding_position=2,
            ),
            schema_pb2.Field(
                name="f_int32",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
                encoding_position=0,
            ),
            schema_pb2.Field(
                name="f_str",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
                encoding_position=1,
            ),
        ])

    Old = named_tuple_from_schema(old_schema)
    New = named_tuple_from_schema(new_schema)
    roundtripped = RowCoder(new_schema).decode(
        RowCoder(old_schema).encode(Old(42, "Hello World!")))

    self.assertEqual(
        New(f_new_str=None, f_int32=42, f_str="Hello World!"), roundtripped)

  def test_row_coder_fail_early_bad_schema(self):
    """Constructing a RowCoder from a field with an empty type must fail."""
    schema_proto = schema_pb2.Schema(
        fields=[
            schema_pb2.Field(
                name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        ])

    # Should raise an exception referencing the problem field
    self.assertRaisesRegex(
        ValueError, "type_with_no_typeinfo", lambda: RowCoder(schema_proto))

  def test_row_coder_cloud_object_schema(self):
    """The cloud object's 'schema' entry is the UTF-8 JSON of the schema."""
    schema_proto = schema_pb2.Schema()
    schema_proto_json = json_format.MessageToJson(schema_proto).encode('utf-8')

    coder = RowCoder(schema_proto)

    cloud_object = coder.as_cloud_object()

    self.assertEqual(schema_proto_json, cloud_object['schema'])
Exemple #11
0
 def test_from_rfc3339_failure(self):
   """Malformed RFC 3339 input must raise ValueError matching 'parse'.

   Uses ``assertRaisesRegex``: the older ``assertRaisesRegexp`` spelling is a
   deprecated alias that no longer exists in Python 3.12 and later.
   """
   with self.assertRaisesRegex(ValueError, 'parse'):
     Timestamp.from_rfc3339('not rfc3339')
   # A valid timestamp followed by trailing text must be rejected as well.
   with self.assertRaisesRegex(ValueError, 'parse'):
     Timestamp.from_rfc3339('2016-03-18T23:22:59.123456Z unparseable')