コード例 #1
0
  def test_encoding_position_reorder_fields(self):
    """A row encoded under schema1 must decode under schema2, which lists
    the same two fields in the opposite order but pins their encoding
    positions to match schema1's declaration order."""
    int32_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT32)
    str_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)

    schema1 = schema_pb2.Schema(
        id="reorder_test_schema1",
        fields=[
            schema_pb2.Field(name="f_int32", type=int32_type),
            schema_pb2.Field(name="f_str", type=str_type),
        ])
    # Same fields, reversed declaration order, with explicit encoding
    # positions restoring schema1's wire layout.
    schema2 = schema_pb2.Schema(
        id="reorder_test_schema2",
        encoding_positions_set=True,
        fields=[
            schema_pb2.Field(
                name="f_str", type=str_type, encoding_position=1),
            schema_pb2.Field(
                name="f_int32", type=int32_type, encoding_position=0),
        ])

    Row1 = named_tuple_from_schema(schema1)
    Row2 = named_tuple_from_schema(schema2)
    encoded = RowCoder(schema1).encode(Row1(42, "Hello World!"))
    roundtripped = RowCoder(schema2).decode(encoded)

    self.assertEqual(Row2(f_int32=42, f_str="Hello World!"), roundtripped)
コード例 #2
0
 def _get_named_tuple_instance(self):
     """Builds a schema'd named-tuple row for self._values, typing each
     field from the annotations on the wrapped transform's __init__."""
     annotations = self._transform.__init__.__annotations__
     # Only annotated parameters that actually have a value become fields.
     named_fields = [(name, convert_to_typing_type(hint))
                     for name, hint in annotations.items()
                     if name in self._values]
     schema = named_fields_to_schema(named_fields)
     return named_tuple_from_schema(schema)(**self._values)
コード例 #3
0
    def _get_schema_proto_and_payload(self, *args, **kwargs):
        """Builds a schema proto and an encoded row payload from the given
        positional and keyword arguments.

        Positional args get generated field names via ``IGNORED_ARG_FORMAT``
        (position-based); keyword args use their key as the field name.

        Raises:
          ValueError: if any value is None (None values are not supported)
            or a keyword argument name is empty.

        Returns:
          A ``(schema_proto, payload)`` tuple where ``payload`` is the row
          encoded with a RowCoder.
        """
        named_fields = []
        fields_to_values = OrderedDict()
        # Positional arguments: synthesize a position-based field name once
        # and reuse it (the original computed the format twice per arg).
        for next_field_id, value in enumerate(args):
            if value is None:
                raise ValueError(
                    'Received value None. None values are currently not supported'
                )
            field_name = (
                JavaClassLookupPayloadBuilder.IGNORED_ARG_FORMAT %
                next_field_id)
            named_fields.append(
                (field_name, convert_to_typing_type(instance_to_type(value))))
            fields_to_values[field_name] = value
        for key, value in kwargs.items():
            if not key:
                raise ValueError('Parameter name cannot be empty')
            if value is None:
                raise ValueError(
                    'Received value None for key %s. None values are currently not '
                    'supported' % key)
            named_fields.append(
                (key, convert_to_typing_type(instance_to_type(value))))
            fields_to_values[key] = value

        schema_proto = named_fields_to_schema(named_fields)
        row = named_tuple_from_schema(schema_proto)(**fields_to_values)
        # Re-derive the schema from the generated type so the coder and the
        # returned proto agree.
        schema = named_tuple_to_schema(type(row))

        payload = RowCoder(schema).encode(row)
        return (schema_proto, payload)
コード例 #4
0
    def _get_named_tuple_instance(self):
        """Builds a typed named-tuple row from self._values, inferring each
        field's type from its value."""
        # omit fields with value=None since we can't infer their type
        values = {k: v for k, v in self._values.items() if v is not None}

        # In python 2 named_fields_to_schema will not accept str because its
        # ambiguous. This converts str hints to ByteString recursively so its
        # clear we intend to use BYTES.
        # TODO(BEAM-7372): Remove coercion to ByteString
        def coerce_str_to_bytes(typ):
            if typ == str:
                return ByteString
            if hasattr(typ, '__args__') and hasattr(typ, '__origin__'):
                # Create a new type rather than modifying the existing one
                coerced = tuple(coerce_str_to_bytes(a) for a in typ.__args__)
                typ = typ.__origin__[coerced]
            return typ

        if sys.version_info[0] >= 3:
            # Python 3's str is unambiguous, so no coercion is needed.
            coerce_str_to_bytes = lambda x: x

        named_fields = []
        for key, value in values.items():
            hint = coerce_str_to_bytes(
                convert_to_typing_type(instance_to_type(value)))
            named_fields.append((key, hint))

        schema = named_fields_to_schema(named_fields)
        return named_tuple_from_schema(schema)(**values)
コード例 #5
0
ファイル: external.py プロジェクト: sunjincheng121/beam
    def _get_named_tuple_instance(self):
        """Builds a typed named-tuple row from self._values, inferring each
        field's type from its value.

        Fields whose value is None are omitted because their type cannot be
        inferred.
        """
        # omit fields with value=None since we can't infer their type
        values = {
            key: value
            for key, value in self._values.items() if value is not None
        }

        # TODO(BEAM-7372): Remove coercion to ByteString
        def coerce_str_to_bytes(typ):
            if typ == str:
                return ByteString

            elif hasattr(typ, '__args__') and hasattr(typ, '__origin__'):
                # Create a new parameterized type rather than mutating
                # typ.__args__ in place: typing objects are shared/cached, so
                # in-place mutation corrupts unrelated type hints elsewhere.
                typ = typ.__origin__[tuple(
                    map(coerce_str_to_bytes, typ.__args__))]

            return typ

        # On Python 3 (str == unicode is False only on py2) str is
        # unambiguous, so no coercion is needed.
        if str == unicode:
            coerce_str_to_bytes = lambda x: x

        schema = named_fields_to_schema([
            (key,
             coerce_str_to_bytes(
                 convert_to_typing_type(instance_to_type(value))))
            for key, value in values.items()
        ])
        return named_tuple_from_schema(schema)(**values)
コード例 #6
0
def value_parser_from_schema(schema):
  """Returns a callable that converts a dict of raw attribute values into an
  instance of the NamedTuple generated from ``schema``.

  Raises:
    ValueError: (from the returned parser) if the input dict contains keys
      that are not fields of the schema.
  """
  def attribute_parser_from_type(type_):
    # TODO: This should be exhaustive
    type_info = type_.WhichOneof("type_info")
    if type_info == "atomic_type":
      return schemas.ATOMIC_TYPE_TO_PRIMITIVE[type_.atomic_type]
    elif type_info == "array_type":
      element_parser = attribute_parser_from_type(type_.array_type.element_type)
      return lambda x: list(map(element_parser, x))
    elif type_info == "map_type":
      # Fix: key/value types live under map_type, not array_type (the
      # original read type_.array_type.key_type / .value_type, which do not
      # exist on ArrayType).
      key_parser = attribute_parser_from_type(type_.map_type.key_type)
      value_parser = attribute_parser_from_type(type_.map_type.value_type)
      return lambda x: dict((key_parser(k), value_parser(v))
                            for k, v in x.items())

  parsers = [(field.name, attribute_parser_from_type(field.type))
             for field in schema.fields]

  constructor = schemas.named_tuple_from_schema(schema)

  def value_parser(x):
    # Consume fields in schema order; a None value short-circuits parsing.
    result = []
    for name, parser in parsers:
      value = x.pop(name)
      result.append(None if value is None else parser(value))

    # Any keys left over were not part of the schema.
    if len(x):
      raise ValueError(
          "Test data contains attributes that don't exist in the schema: {}"
          .format(', '.join(x.keys())))

    return constructor(*result)

  return value_parser
コード例 #7
0
 def _get_named_tuple_instance(self):
     """Converts the wrapped dataclass transform into a schema'd named
     tuple carrying its field values."""
     import dataclasses
     transform_fields = dataclasses.fields(self._transform)
     schema = named_fields_to_schema(
         [(f.name, convert_to_typing_type(f.type))
          for f in transform_fields])
     row_type = named_tuple_from_schema(schema)
     return row_type(**dataclasses.asdict(self._transform))
コード例 #8
0
    def test_schema_with_bad_field_raises_helpful_error(self):
        """A schema containing a field with no type_info must fail with an
        error message that names the offending field."""
        bad_field = schema_pb2.Field(
            name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        schema_proto = schema_pb2.Schema(fields=[bad_field])

        # Should raise an exception referencing the problem field
        self.assertRaisesRegex(ValueError, "type_with_no_typeinfo",
                               lambda: named_tuple_from_schema(schema_proto))
コード例 #9
0
def value_parser_from_schema(schema):
    """Returns a callable that parses a dict of raw attribute values into an
    instance of the NamedTuple generated from ``schema``.

    Fields are parsed according to their FieldType; nullable fields pass
    None through unchanged. The returned parser rejects input keys that are
    not fields of the schema.
    """
    def attribute_parser_from_type(type_):
        # Wrap the non-null parser so nullable fields accept None.
        parser = nonnull_attribute_parser_from_type(type_)
        if type_.nullable:
            return lambda x: None if x is None else parser(x)
        else:
            return parser

    def nonnull_attribute_parser_from_type(type_):
        # Builds a parser for a single (non-null) FieldType.
        # TODO: This should be exhaustive
        type_info = type_.WhichOneof("type_info")
        if type_info == "atomic_type":
            if type_.atomic_type == schema_pb2.BYTES:
                # BYTES fields arrive as text; encode to actual bytes.
                return lambda x: x.encode("utf-8")
            else:
                return schemas.ATOMIC_TYPE_TO_PRIMITIVE[type_.atomic_type]
        elif type_info == "array_type":
            element_parser = attribute_parser_from_type(
                type_.array_type.element_type)
            return lambda x: list(map(element_parser, x))
        elif type_info == "map_type":
            key_parser = attribute_parser_from_type(type_.map_type.key_type)
            value_parser = attribute_parser_from_type(
                type_.map_type.value_type)
            return lambda x: dict(
                (key_parser(k), value_parser(v)) for k, v in x.items())
        elif type_info == "row_type":
            # Nested rows recurse with a parser for the nested schema.
            return value_parser_from_schema(type_.row_type.schema)
        elif type_info == "logical_type":
            # In YAML logical types are represented with their representation types.
            to_language_type = schemas.LogicalType.from_runner_api(
                type_.logical_type).to_language_type
            parse_representation = attribute_parser_from_type(
                type_.logical_type.representation)
            return lambda x: to_language_type(parse_representation(x))

    parsers = [(field.name, attribute_parser_from_type(field.type))
               for field in schema.fields]

    constructor = schemas.named_tuple_from_schema(schema)

    def value_parser(x):
        # Pop consumed keys from a copy so leftovers can be reported as
        # unknown attributes without mutating the caller's dict.
        result = []
        x = deepcopy(x)
        for name, parser in parsers:
            value = x.pop(name)
            result.append(None if value is None else parser(value))

        if len(x):
            raise ValueError(
                "Test data contains attributes that don't exist in the schema: {}"
                .format(', '.join(x.keys())))

        return constructor(*result)

    return value_parser
コード例 #10
0
  def test_encoding_position_add_fields_and_reorder(self):
    """A row encoded with the old two-field schema must decode with a new
    schema that adds a nullable field and reorders declarations; explicit
    encoding positions keep the original fields on their old wire slots and
    the added field decodes as None."""
    old_schema = schema_pb2.Schema(
        id="add_test_old",
        fields=[
            schema_pb2.Field(
                name="f_int32",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
            ),
            schema_pb2.Field(
                name="f_str",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
            ),
        ])
    # New field is declared first but placed last on the wire
    # (encoding_position=2); old fields keep positions 0 and 1.
    new_schema = schema_pb2.Schema(
        encoding_positions_set=True,
        id="add_test_new",
        fields=[
            schema_pb2.Field(
                name="f_new_str",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True),
                encoding_position=2,
            ),
            schema_pb2.Field(
                name="f_int32",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
                encoding_position=0,
            ),
            schema_pb2.Field(
                name="f_str",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
                encoding_position=1,
            ),
        ])

    Old = named_tuple_from_schema(old_schema)
    New = named_tuple_from_schema(new_schema)
    roundtripped = RowCoder(new_schema).decode(
        RowCoder(old_schema).encode(Old(42, "Hello World!")))

    self.assertEqual(
        New(f_new_str=None, f_int32=42, f_str="Hello World!"), roundtripped)
コード例 #11
0
    def _get_named_tuple_instance(self):
        """Builds a typed named-tuple row from self._values, inferring each
        field's type from its value."""
        # omit fields with value=None since we can't infer their type
        values = {k: v for k, v in self._values.items() if v is not None}

        named_fields = [(k, convert_to_typing_type(instance_to_type(v)))
                        for k, v in values.items()]
        schema = named_fields_to_schema(named_fields)
        return named_tuple_from_schema(schema)(**values)
コード例 #12
0
    def test_generated_class_pickle(self):
        """Instances of a schema-generated NamedTuple survive a pickle
        round trip."""
        name_field = schema_pb2.Field(
            name='name',
            type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING))
        schema = schema_pb2.Schema(id="some-uuid", fields=[name_field])

        user_type = named_tuple_from_schema(schema)
        instance = user_type(name="test")

        roundtripped = pickle.loads(pickle.dumps(instance))
        self.assertEqual(instance, roundtripped)
コード例 #13
0
ファイル: row_coder.py プロジェクト: melap/beam
  def __init__(self, schema):
    """Initializes a :class:`RowCoder`.

    Args:
      schema (apache_beam.portability.api.schema_pb2.Schema): The protobuf
        representation of the schema of the data that the RowCoder will be used
        to encode/decode.
    """
    self.schema = schema

    # Eagerly generate type hint to escalate any issues with the Schema proto
    # at construction time rather than at first encode/decode.
    self._type_hint = named_tuple_from_schema(self.schema)

    # Use non-null coders because null values are represented separately
    # (presumably in the row encoding itself -- confirm in the coder impl).
    self.components = [
        _nonnull_coder_from_type(field.type) for field in self.schema.fields
    ]
コード例 #14
0
    def test_schema_with_bad_field_raises_helpful_error(self):
        """named_tuple_from_schema must name the offending field when a
        field carries no type_info."""
        bad_field = schema_pb2.Field(
            name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        schema_proto = schema_pb2.Schema(
            fields=[bad_field],
            id="helpful-error-uuid",
        )

        def build_type():
            # bypass schema cache
            return named_tuple_from_schema(
                schema_proto, schema_registry=SchemaTypeRegistry())

        # Should raise an exception referencing the problem field
        self.assertRaisesRegex(
            ValueError, "type_with_no_typeinfo", build_type)
コード例 #15
0
def element_type_from_dataframe(proxy, include_indexes=False):
    # type: (pd.DataFrame, bool) -> type
    """Generate an element_type for an element-wise PCollection from a proxy
    pandas object.

    Currently only supports generating the element_type for a schema-aware
    PCollection from a proxy DataFrame.

    Args:
      proxy: The proxy ``pd.DataFrame`` whose columns (and optionally index
        levels) define the schema fields.
      include_indexes: If True, index levels are included as leading fields.
        Every index must then have a unique, non-empty name that does not
        collide with any column name.

    Raises:
      ValueError: if ``include_indexes=True`` and an index is unnamed, has a
        duplicate name, or shares a name with a column.
    """
    output_columns = []
    if include_indexes:
        remaining_index_names = list(proxy.index.names)
        i = 0
        # Consume names front-to-back so duplicates can be detected against
        # the names that remain. (Idiomatic truthiness test instead of
        # `while len(...)`.)
        while remaining_index_names:
            index_name = remaining_index_names.pop(0)
            if index_name is None:
                raise ValueError(
                    "Encountered an unnamed index. Cannot convert to a "
                    "schema-aware PCollection with include_indexes=True. "
                    "Please name all indexes or consider not including "
                    "indexes.")
            elif index_name in remaining_index_names:
                raise ValueError(
                    "Encountered multiple indexes with the name '%s'. "
                    "Cannot convert to a schema-aware PCollection with "
                    "include_indexes=True. Please ensure all indexes have "
                    "unique names or consider not including indexes." %
                    index_name)
            elif index_name in proxy.columns:
                raise ValueError(
                    "Encountered an index that has the same name as one "
                    "of the columns, '%s'. Cannot convert to a "
                    "schema-aware PCollection with include_indexes=True. "
                    "Please ensure all indexes have unique names or "
                    "consider not including indexes." % index_name)
            else:
                # Valid index level: record its name and dtype.
                output_columns.append(
                    (index_name, proxy.index.get_level_values(i).dtype))
                i += 1

    output_columns.extend(zip(proxy.columns, proxy.dtypes))

    return named_tuple_from_schema(
        named_fields_to_schema([(column, _dtype_to_fieldtype(dtype))
                                for (column, dtype) in output_columns]))
コード例 #16
0
 def _from_serialized_schema(cls, schema_str):
     """Reconstructs the coder from a serialized Schema proto."""
     schema_proto = proto_utils.parse_Bytes(schema_str, schema_pb2.Schema)
     return cls(named_tuple_from_schema(schema_proto))
コード例 #17
0
ファイル: row_coder.py プロジェクト: colincadams/beam
 def to_type_hint(self):
   """Returns the NamedTuple type generated from this coder's schema."""
   hint = named_tuple_from_schema(self.schema)
   return hint
コード例 #18
0
def convert_to_typing_type(type_):
    """Converts a Beam type constraint to a ``typing`` type.

    RowTypeConstraints become schema-generated NamedTuple types; everything
    else is delegated to native_type_compatibility.
    """
    if not isinstance(type_, row_type.RowTypeConstraint):
        return native_type_compatibility.convert_to_typing_type(type_)
    schema = named_fields_to_schema(type_._fields)
    return named_tuple_from_schema(schema)
コード例 #19
0
ファイル: row_coder.py プロジェクト: colincadams/beam
 def __init__(self, schema, components):
   """Initializes the row coder impl.

   Args:
     schema: the schema_pb2.Schema proto describing the row layout.
     components: per-field component coders, in schema field order.
   """
   self.schema = schema
   # NamedTuple type used to construct decoded rows.
   self.constructor = named_tuple_from_schema(schema)
   # List comprehension instead of list(genexpr) (flake8-comprehensions
   # C400) -- same result, clearer and avoids the extra generator.
   self.components = [c.get_impl() for c in components]
   self.has_nullable_fields = any(
       field.type.nullable for field in self.schema.fields)