def resolve_record(writer, reader):
    '''Build the 'meta' record schema used to read writer data as the reader.

    Output fields come in two groups:

    * every writer field, in writer order -- a field also present in the
      reader carries the recursively resolved type, a writer-only field is
      wrapped in the non-standard ``skip`` type;
    * every reader-only field, in reader order, wrapped in the non-standard
      ``default`` type that holds the reader's default value.

    The result carries the reader's name and, when both schemas declare a
    namespace, the reader's namespace.

    Raises SchemaResolutionException when the record names differ or when a
    reader-only field has no default.
    '''
    if writer['name'] != reader['name']:
        raise SchemaResolutionException(
            "Schemas not compatible record names don't match")

    writer_names = set(w_field['name'] for w_field in writer['fields'])
    reader_names = set(r_field['name'] for r_field in reader['fields'])

    resolved_fields = []

    # Writer order matters: it is the order the values were encoded in.
    for w_field in writer['fields']:
        name = w_field['name']
        if name in reader_names:
            resolved_type = resolve(
                w_field['type'],
                get_field_by_name(reader['fields'], name)['type'])
        else:
            # Special skip type: the value is on the wire but the reader
            # does not want it.
            resolved_type = {"type": "skip", "value": w_field['type']}
        resolved_fields.append({"name": name, "type": resolved_type})

    # Reader-only fields were never written, so they need a default.
    for r_field in reader['fields']:
        if r_field['name'] in writer_names:
            continue
        try:
            default_value = r_field['default']
        except KeyError:
            raise SchemaResolutionException(
                "Schemas not compatible, no default value for field in "
                "reader's record that's not present in writer's record")
        resolved_fields.append({
            "name": r_field['name'],
            "type": {"type": "default", "value": default_value},
        })

    meta_schema = {
        "type": "record",
        "fields": resolved_fields,
        "name": reader['name'],
    }
    if "namespace" in writer and "namespace" in reader:
        meta_schema["namespace"] = reader["namespace"]
    return meta_schema
def resolve_enum(writer, reader):
    '''Check a writer enum against a reader enum and return the enum to read.

    The returned schema uses the reader's name and a fresh list holding the
    writer's symbols in writer order.

    Raises SchemaResolutionException when the names differ or when the
    writer declares a symbol the reader does not know.
    '''
    if writer['name'] != reader['name']:
        raise SchemaResolutionException(
            "Schemas not compatible, enum names don't match")

    unknown_to_reader = set(writer['symbols']).difference(reader['symbols'])
    if unknown_to_reader:
        raise SchemaResolutionException(
            "Schemas not compatible, symbol in writer's enum not present in reader's enum"
        )

    return {
        'type': 'enum',
        'name': reader['name'],
        'symbols': list(writer['symbols']),
    }
def read_data(self, writers_schema, readers_schema, decoder):
    """
    Read one datum written with writers_schema, resolved against
    readers_schema.

    The two schemas are first checked with SlowDatumReader.match_schemas.
    When only the reader's schema is a union, the first reader branch that
    matches the writer's schema is used and reading restarts against that
    branch. Otherwise the read is dispatched on the writer's schema type:
    primitives are read straight from the decoder, every other type goes to
    the corresponding read_* method.

    Raises SchemaResolutionException when the schemas do not match, and
    schema.AvroException for an unknown writer schema type.
    """
    # schema matching
    if not SlowDatumReader.match_schemas(writers_schema, readers_schema):
        raise SchemaResolutionException('Schemas do not match.',
                                        writers_schema, readers_schema)

    # schema resolution: reader's schema is a union, writer's schema is not
    if (writers_schema.type not in ['union', 'error_union']
            and readers_schema.type in ['union', 'error_union']):
        for branch in readers_schema.schemas:
            if SlowDatumReader.match_schemas(writers_schema, branch):
                return self.read_data(writers_schema, branch, decoder)
        raise SchemaResolutionException('Schemas do not match.',
                                        writers_schema, readers_schema)

    # function dispatch for reading data based on type of writer's schema
    kind = writers_schema.type

    # primitives need nothing from the reader's schema
    if kind == 'null':
        return decoder.read_null()
    if kind == 'boolean':
        return decoder.read_boolean()
    if kind == 'string':
        return decoder.read_utf8()
    if kind == 'int':
        return decoder.read_int()
    if kind == 'long':
        return decoder.read_long()
    if kind == 'float':
        return decoder.read_float()
    if kind == 'double':
        return decoder.read_double()
    if kind == 'bytes':
        return decoder.read_bytes()

    # named and complex types are resolved against the reader's schema
    if kind == 'fixed':
        return self.read_fixed(writers_schema, readers_schema, decoder)
    if kind == 'enum':
        return self.read_enum(writers_schema, readers_schema, decoder)
    if kind == 'array':
        return self.read_array(writers_schema, readers_schema, decoder)
    if kind == 'map':
        return self.read_map(writers_schema, readers_schema, decoder)
    if kind in ['union', 'error_union']:
        return self.read_union(writers_schema, readers_schema, decoder)
    if kind in ['record', 'error', 'request']:
        return self.read_record(writers_schema, readers_schema, decoder)

    fail_msg = "Cannot read unknown schema type: %s" % writers_schema.type
    raise schema.AvroException(fail_msg)
def writers_schema(self, parsed_writer_schema):
    '''Take a writer's schema object from the old avro API and run
    validation and schema resolution (if applicable).

    Since the old API would set this after object construction, we hook
    here so we can run schema validation and resolution once on assignment.

    This also builds the reader function for the resolved schema and stores
    it as ``self.read_datum`` so it is used for data reading. When no
    reader's schema has been set yet, the writer's schema doubles as the
    reader's schema.

    Raises SchemaResolutionException when the writer's and reader's schemas
    do not match.
    '''
    self._writers_schema = parsed_writer_schema

    # create the reader function from the schema
    if not hasattr(self, 'readers_schema'):
        # if no reader schema, then the resolved schema is just the writers
        # schema
        meta_schema = parsed_writer_schema.to_json()
        self.readers_schema = parsed_writer_schema
    else:
        # to_json is a terrible method name for something
        # that returns a python dict! :/
        meta_schema = resolve(parsed_writer_schema.to_json(),
                              self.readers_schema.to_json())
    self.read_datum = get_reader(meta_schema)

    # schema matching
    if FastDatumReader.match_schemas(self.writers_schema,
                                     self.readers_schema):
        return
    raise SchemaResolutionException('Schemas do not match.',
                                    self.writers_schema,
                                    self.readers_schema)
def skip_union(self, writers_schema, decoder):
    """
    Skip over one encoded union value.

    Reads the branch index from the decoder as a long, then delegates to
    self.skip_data with the writer's schema for that branch and returns
    its result.

    Raises SchemaResolutionException when the decoded index does not name
    a branch of the writer's union.
    """
    index_of_schema = int(decoder.read_long())
    branches = writers_schema.schemas
    # The lower bound must be checked too: a negative index would not fail
    # on its own, Python would silently count from the end of the list and
    # skip the wrong branch.
    if not 0 <= index_of_schema < len(branches):
        fail_msg = "Can't access branch index %d for union with %d branches"\
                   % (index_of_schema, len(branches))
        raise SchemaResolutionException(fail_msg, writers_schema)
    return self.skip_data(branches[index_of_schema], decoder)
def resolve_fixed(writer, reader):
    '''Check a fixed writer schema against a fixed reader schema.

    Returns a shallow copy of the writer's schema.

    Raises SchemaResolutionException when the names or the sizes differ.
    '''
    compatible = (writer['name'] == reader['name']
                  and writer['size'] == reader['size'])
    if not compatible:
        raise SchemaResolutionException(
            "Schemas not compatible, fixed names or sizes don't match")
    return dict(writer)
def resolve(writer, reader):
    '''Take a writer and a reader schema and return a meta schema that
    translates the writer's schema to the reader's schema.

    This handles skipping missing fields and default fills by creating
    non-standard 'types' for reader creation. These non-standard types are
    never surfaced out since they're not standard avro types but just used
    as an implementation detail for generating a write-compatible reader.

    Resolution rules, in order:

    * same type on both sides: primitives return the reader's schema,
      array/map/enum/union/record/fixed go to their resolve_* function, and
      anything else returns the reader's schema;
    * the reader is a union and the writer is not: the writer is resolved
      against the reader's branches, branches of the writer's own type
      first, then the remaining ones (which can only succeed through
      promotion), and the first branch that resolves wins;
    * both types are in ``promotable`` and the writer's comes earlier: the
      writer's schema is returned.

    Raises SchemaResolutionException for every other combination.
    '''
    writer_type = get_type(writer)
    reader_type = get_type(reader)

    if writer_type == reader_type:
        if reader_type in primitive_types:
            return reader
        if reader_type == 'array':
            return resolve_array(writer, reader)
        if reader_type == 'map':
            return resolve_map(writer, reader)
        if reader_type == 'enum':
            return resolve_enum(writer, reader)
        if reader_type == 'union':
            return resolve_union(writer, reader)
        if reader_type == "record":
            return resolve_record(writer, reader)
        if reader_type == "fixed":
            return resolve_fixed(writer, reader)
        # for named types or other types that don't match
        # just return the reader
        return reader

    # see if we've 'upgraded' to a union
    if reader_type == 'union':
        exact_branches = []
        other_branches = []
        for branch in reader:
            if get_type(branch) == writer_type:
                exact_branches.append(branch)
            else:
                other_branches.append(branch)

        # A union may hold several branches of the writer's type (records,
        # enums or fixed with different names), so every one of them has to
        # be tried, not just the first.
        first_error = None
        for branch in exact_branches:
            try:
                return resolve(writer, branch)
            except SchemaResolutionException as error:
                if first_error is None:
                    first_error = error

        # No branch of the writer's own type resolved: fall back to branches
        # the writer can be promoted to (e.g. int into ["null", "long"]).
        for branch in other_branches:
            try:
                return resolve(writer, branch)
            except SchemaResolutionException:
                continue

        # Keep the most specific diagnosis: the failure of the first branch
        # of the writer's own type, when there was one.
        if first_error is not None:
            raise first_error
        raise SchemaResolutionException(
            "Writer schema not present in reader union")

    if (writer_type in promotable and reader_type in promotable
            and promotable.index(writer_type) < promotable.index(reader_type)):
        return writer

    raise SchemaResolutionException(
        "Reader and Writer schemas are incompatible")
def read_enum(self, writers_schema, readers_schema, decoder):
    """
    An enum is encoded by a int, representing the zero-based position
    of the symbol in the schema.

    Reads that position from the decoder and returns the writer's symbol
    found there.

    Raises SchemaResolutionException when the position does not name a
    symbol of the writer's enum, or when the symbol is not present in the
    reader's enum.
    """
    # read data
    index_of_symbol = decoder.read_int()
    writers_symbols = writers_schema.symbols
    # The lower bound must be checked too: a negative index would not fail
    # on its own, Python would silently count from the end of the list and
    # return the wrong symbol.
    if not 0 <= index_of_symbol < len(writers_symbols):
        fail_msg = "Can't access enum index %d for enum with %d symbols"\
                   % (index_of_symbol, len(writers_symbols))
        raise SchemaResolutionException(fail_msg, writers_schema,
                                        readers_schema)
    read_symbol = writers_symbols[index_of_symbol]

    # schema resolution
    if read_symbol not in readers_schema.symbols:
        fail_msg = "Symbol %s not present in Reader's Schema" % read_symbol
        raise SchemaResolutionException(fail_msg, writers_schema,
                                        readers_schema)

    return read_symbol
def read_record(self, writers_schema, readers_schema, decoder):
    """
    A record is encoded by encoding the values of its fields
    in the order that they are declared. In other words, a record
    is encoded as just the concatenation of the encodings of its fields.
    Field values are encoded per their schema.

    Schema Resolution:
     * the ordering of fields may be different: fields are matched by name.
     * schemas for fields with the same name in both records are resolved
       recursively.
     * if the writer's record contains a field with a name not present in
       the reader's record, the writer's value for that field is skipped.
     * if the reader's record schema has a field that contains a default
       value, and writer's schema does not have a field with the same name,
       then the reader uses the default value from its field.
     * if the reader's record schema has a field with no default value, and
       writer's schema does not have a field with the same name, a
       SchemaResolutionException is raised.

    Returns a dict mapping field names to the values read.
    """
    # schema resolution
    readers_fields_dict = readers_schema.fields_dict
    result = {}

    # Walk the writer's fields in declaration order: that is the order the
    # values sit in the encoded data.
    for writers_field in writers_schema.fields:
        readers_field = readers_fields_dict.get(writers_field.name)
        if readers_field is None:
            self.skip_data(writers_field.type, decoder)
            continue
        result[writers_field.name] = self.read_data(
            writers_field.type, readers_field.type, decoder)

    # Every reader field was read from the data: nothing to default.
    if len(readers_fields_dict) <= len(result):
        return result

    # fill in default values
    writers_fields_dict = writers_schema.fields_dict
    for field_name, readers_field in readers_fields_dict.items():
        if field_name in writers_fields_dict:
            continue
        if not readers_field.has_default:
            fail_msg = 'No default value for field %s' % field_name
            raise SchemaResolutionException(fail_msg, writers_schema,
                                            readers_schema)
        result[readers_field.name] = self._read_default_value(
            readers_field.type, readers_field.default)
    return result
def read_union(self, writers_schema, readers_schema, decoder):
    """
    A union is encoded by first writing a long value indicating
    the zero-based position within the union of the schema of its value.
    The value is then encoded per the indicated schema within the union.

    Reads that position from the decoder, selects the writer's branch it
    names, and returns the result of self.read_data for that branch against
    the reader's schema.

    Raises SchemaResolutionException when the position does not name a
    branch of the writer's union.
    """
    # schema resolution
    index_of_schema = int(decoder.read_long())
    branches = writers_schema.schemas
    # The lower bound must be checked too: a negative index would not fail
    # on its own, Python would silently count from the end of the list and
    # decode the value with the wrong branch.
    if not 0 <= index_of_schema < len(branches):
        fail_msg = "Can't access branch index %d for union with %d branches"\
                   % (index_of_schema, len(branches))
        raise SchemaResolutionException(fail_msg, writers_schema,
                                        readers_schema)
    selected_writers_schema = branches[index_of_schema]

    # read data
    return self.read_data(selected_writers_schema, readers_schema, decoder)
def resolve_union(writer, reader):
    '''Resolve a writer union against a reader union.

    Each writer branch is resolved against the reader branches in order and
    the first reader branch that resolves is kept. The result lists one
    resolved schema per writer branch, in writer order.

    Raises SchemaResolutionException when a writer branch resolves against
    none of the reader branches.
    '''
    resolved_branches = []
    for writer_branch in writer:
        matched = False
        for reader_branch in reader:
            try:
                candidate = resolve(writer_branch, reader_branch)
            except SchemaResolutionException:
                # keep trying until all reader branches have been tested
                continue
            resolved_branches.append(candidate)
            matched = True
            break
        if not matched:
            # none of the read types matched the write type, this is an error
            raise SchemaResolutionException(
                "Schema in writer's union not present in reader's union.")
    return resolved_branches