def test_encoding_position_reorder_fields(self):
  schema1 = schema_pb2.Schema(
      id="reorder_test_schema1",
      fields=[
          schema_pb2.Field(
              name="f_int32",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
          ),
          schema_pb2.Field(
              name="f_str",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
          ),
      ])

  schema2 = schema_pb2.Schema(
      id="reorder_test_schema2",
      encoding_positions_set=True,
      fields=[
          schema_pb2.Field(
              name="f_str",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
              encoding_position=1,
          ),
          schema_pb2.Field(
              name="f_int32",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
              encoding_position=0,
          ),
      ])

  RowSchema1 = named_tuple_from_schema(schema1)
  RowSchema2 = named_tuple_from_schema(schema2)

  roundtripped = RowCoder(schema2).decode(
      RowCoder(schema1).encode(RowSchema1(42, "Hello World!")))

  self.assertEqual(RowSchema2(f_int32=42, f_str="Hello World!"), roundtripped)
def _get_named_tuple_instance(self):
  schema = named_fields_to_schema([
      (k, convert_to_typing_type(v))
      for k, v in self._transform.__init__.__annotations__.items()
      if k in self._values
  ])
  return named_tuple_from_schema(schema)(**self._values)
def _get_schema_proto_and_payload(self, *args, **kwargs):
  named_fields = []
  fields_to_values = OrderedDict()

  next_field_id = 0
  for value in args:
    if value is None:
      raise ValueError(
          'Received value None. None values are currently not supported')
    named_fields.append(
        ((JavaClassLookupPayloadBuilder.IGNORED_ARG_FORMAT % next_field_id),
         convert_to_typing_type(instance_to_type(value))))
    fields_to_values[(
        JavaClassLookupPayloadBuilder.IGNORED_ARG_FORMAT %
        next_field_id)] = value
    next_field_id += 1

  for key, value in kwargs.items():
    if not key:
      raise ValueError('Parameter name cannot be empty')
    if value is None:
      raise ValueError(
          'Received value None for key %s. None values are currently not '
          'supported' % key)
    named_fields.append(
        (key, convert_to_typing_type(instance_to_type(value))))
    fields_to_values[key] = value

  schema_proto = named_fields_to_schema(named_fields)
  row = named_tuple_from_schema(schema_proto)(**fields_to_values)
  schema = named_tuple_to_schema(type(row))

  payload = RowCoder(schema).encode(row)
  return (schema_proto, payload)
def _get_named_tuple_instance(self):
  # omit fields with value=None since we can't infer their type
  values = {
      key: value
      for key, value in self._values.items() if value is not None
  }

  # In python 2 named_fields_to_schema will not accept str because it's
  # ambiguous. This converts str hints to ByteString recursively so it's
  # clear we intend to use BYTES.
  # TODO(BEAM-7372): Remove coercion to ByteString
  def coerce_str_to_bytes(typ):
    if typ == str:
      return ByteString
    elif hasattr(typ, '__args__') and hasattr(typ, '__origin__'):
      # Create a new type rather than modifying the existing one
      typ = typ.__origin__[tuple(map(coerce_str_to_bytes, typ.__args__))]
    return typ

  if sys.version_info[0] >= 3:
    coerce_str_to_bytes = lambda x: x

  schema = named_fields_to_schema([
      (key,
       coerce_str_to_bytes(convert_to_typing_type(instance_to_type(value))))
      for key, value in values.items()
  ])
  return named_tuple_from_schema(schema)(**values)
def _get_named_tuple_instance(self):
  # omit fields with value=None since we can't infer their type
  values = {
      key: value
      for key, value in self._values.items() if value is not None
  }

  # TODO(BEAM-7372): Remove coercion to ByteString
  def coerce_str_to_bytes(typ):
    if typ == str:
      return ByteString
    elif hasattr(typ, '__args__'):
      typ.__args__ = tuple(map(coerce_str_to_bytes, typ.__args__))
    return typ

  if str == unicode:
    coerce_str_to_bytes = lambda x: x

  schema = named_fields_to_schema([
      (key,
       coerce_str_to_bytes(convert_to_typing_type(instance_to_type(value))))
      for key, value in values.items()
  ])
  return named_tuple_from_schema(schema)(**values)
def value_parser_from_schema(schema): def attribute_parser_from_type(type_): # TODO: This should be exhaustive type_info = type_.WhichOneof("type_info") if type_info == "atomic_type": return schemas.ATOMIC_TYPE_TO_PRIMITIVE[type_.atomic_type] elif type_info == "array_type": element_parser = attribute_parser_from_type(type_.array_type.element_type) return lambda x: list(map(element_parser, x)) elif type_info == "map_type": key_parser = attribute_parser_from_type(type_.array_type.key_type) value_parser = attribute_parser_from_type(type_.array_type.value_type) return lambda x: dict((key_parser(k), value_parser(v)) for k, v in x.items()) parsers = [(field.name, attribute_parser_from_type(field.type)) for field in schema.fields] constructor = schemas.named_tuple_from_schema(schema) def value_parser(x): result = [] for name, parser in parsers: value = x.pop(name) result.append(None if value is None else parser(value)) if len(x): raise ValueError( "Test data contains attributes that don't exist in the schema: {}" .format(', '.join(x.keys()))) return constructor(*result) return value_parser
def _get_named_tuple_instance(self):
  import dataclasses

  schema = named_fields_to_schema([
      (field.name, convert_to_typing_type(field.type))
      for field in dataclasses.fields(self._transform)
  ])
  return named_tuple_from_schema(schema)(
      **dataclasses.asdict(self._transform))
def test_schema_with_bad_field_raises_helpful_error(self):
  schema_proto = schema_pb2.Schema(
      fields=[
          schema_pb2.Field(
              name="type_with_no_typeinfo", type=schema_pb2.FieldType())
      ])

  # Should raise an exception referencing the problem field
  self.assertRaisesRegex(
      ValueError,
      "type_with_no_typeinfo",
      lambda: named_tuple_from_schema(schema_proto))
def value_parser_from_schema(schema):
  def attribute_parser_from_type(type_):
    parser = nonnull_attribute_parser_from_type(type_)
    if type_.nullable:
      return lambda x: None if x is None else parser(x)
    else:
      return parser

  def nonnull_attribute_parser_from_type(type_):
    # TODO: This should be exhaustive
    type_info = type_.WhichOneof("type_info")
    if type_info == "atomic_type":
      if type_.atomic_type == schema_pb2.BYTES:
        return lambda x: x.encode("utf-8")
      else:
        return schemas.ATOMIC_TYPE_TO_PRIMITIVE[type_.atomic_type]
    elif type_info == "array_type":
      element_parser = attribute_parser_from_type(
          type_.array_type.element_type)
      return lambda x: list(map(element_parser, x))
    elif type_info == "map_type":
      key_parser = attribute_parser_from_type(type_.map_type.key_type)
      value_parser = attribute_parser_from_type(type_.map_type.value_type)
      return lambda x: dict(
          (key_parser(k), value_parser(v)) for k, v in x.items())
    elif type_info == "row_type":
      return value_parser_from_schema(type_.row_type.schema)
    elif type_info == "logical_type":
      # In YAML logical types are represented with their representation types.
      to_language_type = schemas.LogicalType.from_runner_api(
          type_.logical_type).to_language_type
      parse_representation = attribute_parser_from_type(
          type_.logical_type.representation)
      return lambda x: to_language_type(parse_representation(x))

  parsers = [(field.name, attribute_parser_from_type(field.type))
             for field in schema.fields]

  constructor = schemas.named_tuple_from_schema(schema)

  def value_parser(x):
    result = []
    x = deepcopy(x)
    for name, parser in parsers:
      value = x.pop(name)
      result.append(None if value is None else parser(value))

    if len(x):
      raise ValueError(
          "Test data contains attributes that don't exist in the schema: {}".
          format(', '.join(x.keys())))

    return constructor(*result)

  return value_parser
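A minimal usage sketch for value_parser_from_schema, assuming it and schema_pb2 are importable in the same module; the schema id, field names, and values are illustrative:

# Sketch only: builds a two-field schema and parses one dict of test data.
schema = schema_pb2.Schema(
    id="value-parser-example",
    fields=[
        schema_pb2.Field(
            name="f_int32",
            type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
        schema_pb2.Field(
            name="f_str",
            type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING)),
    ])

# The returned callable converts a dict of test data into an instance of the
# named tuple generated from the schema.
parse = value_parser_from_schema(schema)
row = parse({"f_int32": 42, "f_str": "Hello World!"})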
def test_encoding_position_add_fields_and_reorder(self):
  old_schema = schema_pb2.Schema(
      id="add_test_old",
      fields=[
          schema_pb2.Field(
              name="f_int32",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
          ),
          schema_pb2.Field(
              name="f_str",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
          ),
      ])

  new_schema = schema_pb2.Schema(
      encoding_positions_set=True,
      id="add_test_new",
      fields=[
          schema_pb2.Field(
              name="f_new_str",
              type=schema_pb2.FieldType(
                  atomic_type=schema_pb2.STRING, nullable=True),
              encoding_position=2,
          ),
          schema_pb2.Field(
              name="f_int32",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
              encoding_position=0,
          ),
          schema_pb2.Field(
              name="f_str",
              type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
              encoding_position=1,
          ),
      ])

  Old = named_tuple_from_schema(old_schema)
  New = named_tuple_from_schema(new_schema)

  roundtripped = RowCoder(new_schema).decode(
      RowCoder(old_schema).encode(Old(42, "Hello World!")))

  self.assertEqual(
      New(f_new_str=None, f_int32=42, f_str="Hello World!"), roundtripped)
def _get_named_tuple_instance(self):
  # omit fields with value=None since we can't infer their type
  values = {
      key: value
      for key, value in self._values.items() if value is not None
  }
  schema = named_fields_to_schema([
      (key, convert_to_typing_type(instance_to_type(value)))
      for key, value in values.items()
  ])
  return named_tuple_from_schema(schema)(**values)
def test_generated_class_pickle(self):
  schema = schema_pb2.Schema(
      id="some-uuid",
      fields=[
          schema_pb2.Field(
              name='name',
              type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
          )
      ])
  user_type = named_tuple_from_schema(schema)
  instance = user_type(name="test")

  self.assertEqual(instance, pickle.loads(pickle.dumps(instance)))
def __init__(self, schema):
  """Initializes a :class:`RowCoder`.

  Args:
    schema (apache_beam.portability.api.schema_pb2.Schema): The protobuf
      representation of the schema of the data that the RowCoder will be
      used to encode/decode.
  """
  self.schema = schema

  # Eagerly generate type hint to escalate any issues with the Schema proto
  self._type_hint = named_tuple_from_schema(self.schema)

  # Use non-null coders because null values are represented separately
  self.components = [
      _nonnull_coder_from_type(field.type) for field in self.schema.fields
  ]
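A minimal round-trip sketch for RowCoder, mirroring the tests in this section; the schema id and field names are illustrative:

# Sketch only: encode and decode one row with the same schema.
schema = schema_pb2.Schema(
    id="rowcoder-example",
    fields=[
        schema_pb2.Field(
            name="f_int32",
            type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
        schema_pb2.Field(
            name="f_str",
            type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING)),
    ])

# The type hint generated from the schema doubles as the row constructor.
Row = named_tuple_from_schema(schema)
coder = RowCoder(schema)

encoded = coder.encode(Row(42, "Hello World!"))
assert coder.decode(encoded) == Row(f_int32=42, f_str="Hello World!")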
def test_schema_with_bad_field_raises_helpful_error(self):
  schema_proto = schema_pb2.Schema(
      fields=[
          schema_pb2.Field(
              name="type_with_no_typeinfo", type=schema_pb2.FieldType())
      ],
      id="helpful-error-uuid",
  )

  # Should raise an exception referencing the problem field
  self.assertRaisesRegex(
      ValueError,
      "type_with_no_typeinfo",
      lambda: named_tuple_from_schema(
          schema_proto,
          # bypass schema cache
          schema_registry=SchemaTypeRegistry()))
def element_type_from_dataframe(proxy, include_indexes=False):
  # type: (pd.DataFrame, bool) -> type

  """Generate an element_type for an element-wise PCollection from a proxy
  pandas object.

  Currently only supports generating the element_type for a schema-aware
  PCollection from a DataFrame proxy.
  """
  output_columns = []
  if include_indexes:
    remaining_index_names = list(proxy.index.names)
    i = 0
    while len(remaining_index_names):
      index_name = remaining_index_names.pop(0)
      if index_name is None:
        raise ValueError(
            "Encountered an unnamed index. Cannot convert to a "
            "schema-aware PCollection with include_indexes=True. "
            "Please name all indexes or consider not including "
            "indexes.")
      elif index_name in remaining_index_names:
        raise ValueError(
            "Encountered multiple indexes with the name '%s'. "
            "Cannot convert to a schema-aware PCollection with "
            "include_indexes=True. Please ensure all indexes have "
            "unique names or consider not including indexes." % index_name)
      elif index_name in proxy.columns:
        raise ValueError(
            "Encountered an index that has the same name as one "
            "of the columns, '%s'. Cannot convert to a "
            "schema-aware PCollection with include_indexes=True. "
            "Please ensure all indexes have unique names or "
            "consider not including indexes." % index_name)
      else:
        # it's ok!
        output_columns.append(
            (index_name, proxy.index.get_level_values(i).dtype))
        i += 1

  output_columns.extend(zip(proxy.columns, proxy.dtypes))

  return named_tuple_from_schema(
      named_fields_to_schema([(column, _dtype_to_fieldtype(dtype))
                              for (column, dtype) in output_columns]))
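A small usage sketch for element_type_from_dataframe, assuming pandas is available; the column names and dtypes are illustrative:

import pandas as pd

# A proxy DataFrame only needs column names and dtypes, not data.
proxy = pd.DataFrame({
    "f_int": pd.Series(dtype="int64"),
    "f_float": pd.Series(dtype="float64"),
})

# Produces a NamedTuple subclass with one field per column (plus one per
# named index when include_indexes=True).
element_type = element_type_from_dataframe(proxy)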
def _from_serialized_schema(cls, schema_str):
  return cls(
      named_tuple_from_schema(
          proto_utils.parse_Bytes(schema_str, schema_pb2.Schema)))
def to_type_hint(self):
  return named_tuple_from_schema(self.schema)
def convert_to_typing_type(type_):
  if isinstance(type_, row_type.RowTypeConstraint):
    return named_tuple_from_schema(named_fields_to_schema(type_._fields))
  else:
    return native_type_compatibility.convert_to_typing_type(type_)
def __init__(self, schema, components):
  self.schema = schema
  self.constructor = named_tuple_from_schema(schema)
  self.components = list(c.get_impl() for c in components)
  self.has_nullable_fields = any(
      field.type.nullable for field in self.schema.fields)