Exemple #1
0
def resolve_record(writer, reader):
    '''Build the 'meta' record schema used to read writer data as the reader.

    Fields are emitted in writer order first: a field present in both
    schemas carries the recursively resolved type, while a writer-only field
    is wrapped in the non-standard "skip" type.  Reader-only fields follow in
    reader order, wrapped in the non-standard "default" type.

    The reader's namespace is kept only when both schemas declare one.

    Raises SchemaResolutionException when the record names differ or when a
    reader-only field has no "default" entry.
    '''
    if writer['name'] != reader['name']:
        raise SchemaResolutionException(
            "Schemas not compatible record names don't match")

    writer_names = {entry['name'] for entry in writer['fields']}
    reader_names = {entry['name'] for entry in reader['fields']}

    merged = []

    # writer order: these are the values physically present in the data
    for w_field in writer['fields']:
        name = w_field['name']
        if name in reader_names:
            field_type = resolve(
                w_field['type'],
                get_field_by_name(reader['fields'], name)['type'])
        else:
            # special skip type: the value is decoded and thrown away
            field_type = {"type": "skip", "value": w_field['type']}
        merged.append({"name": name, "type": field_type})

    # reader-only fields can only be filled from their declared default
    for r_field in reader['fields']:
        if r_field['name'] in writer_names:
            continue
        try:
            default_value = r_field['default']
        except KeyError:
            raise SchemaResolutionException(
                "Schemas not compatible, no default value for field in reader's record that's not present in writer's record"
            )
        merged.append({
            "name": r_field['name'],
            "type": {
                "type": "default",
                "value": default_value
            }
        })

    resolved = {"type": "record", "fields": merged, "name": reader['name']}
    if "namespace" in writer and "namespace" in reader:
        resolved["namespace"] = reader["namespace"]
    return resolved
Exemple #2
0
def resolve_enum(writer, reader):
    '''Resolve a writer enum against a reader enum.

    The result uses the reader's name and a fresh list of the writer's
    symbols, in writer order.

    Raises SchemaResolutionException when the names differ or when the
    writer declares a symbol the reader does not.
    '''
    if writer['name'] != reader['name']:
        raise SchemaResolutionException(
            "Schemas not compatible, enum names don't match")

    unknown_to_reader = set(writer['symbols']) - set(reader['symbols'])
    if unknown_to_reader:
        raise SchemaResolutionException(
            "Schemas not compatible, symbol in writer's enum not present in reader's enum"
        )

    resolved = {'type': 'enum', 'name': reader['name']}
    resolved['symbols'] = list(writer['symbols'])
    return resolved
Exemple #3
0
    def read_data(self, writers_schema, readers_schema, decoder):
        """
        Read one datum from `decoder`, dispatching on the writer's schema type.

        When the reader's schema is a union and the writer's is not, the first
        reader branch accepted by `SlowDatumReader.match_schemas` is used and
        the read is retried against that branch.

        Raises SchemaResolutionException when the schemas do not match, and
        schema.AvroException for an unknown writer schema type.
        """
        union_kinds = ('union', 'error_union')

        # schema matching
        if not SlowDatumReader.match_schemas(writers_schema, readers_schema):
            raise SchemaResolutionException('Schemas do not match.',
                                            writers_schema, readers_schema)

        # schema resolution: reader's schema is a union, writer's schema is not
        if (writers_schema.type not in union_kinds
                and readers_schema.type in union_kinds):
            for branch in readers_schema.schemas:
                if SlowDatumReader.match_schemas(writers_schema, branch):
                    return self.read_data(writers_schema, branch, decoder)
            raise SchemaResolutionException('Schemas do not match.',
                                            writers_schema, readers_schema)

        kind = writers_schema.type

        # primitives are read straight off the decoder
        decoder_method = {
            'null': 'read_null',
            'boolean': 'read_boolean',
            'string': 'read_utf8',
            'int': 'read_int',
            'long': 'read_long',
            'float': 'read_float',
            'double': 'read_double',
            'bytes': 'read_bytes',
        }.get(kind)
        if decoder_method is not None:
            return getattr(decoder, decoder_method)()

        # everything else needs both schemas and goes through a reader method
        reader_method = {
            'fixed': 'read_fixed',
            'enum': 'read_enum',
            'array': 'read_array',
            'map': 'read_map',
            'union': 'read_union',
            'error_union': 'read_union',
            'record': 'read_record',
            'error': 'read_record',
            'request': 'read_record',
        }.get(kind)
        if reader_method is not None:
            return getattr(self, reader_method)(writers_schema,
                                                readers_schema, decoder)

        raise schema.AvroException(
            "Cannot read unknown schema type: %s" % kind)
Exemple #4
0
    def writers_schema(self, parsed_writer_schema):
        '''Store a writer's schema object from the old avro API, then run
        schema resolution (if applicable) and validation.

        Since the old API would set this after object construction, we hook
        here so we can run schema validation and resolution once on assignment.
        This also builds the reader function for the resolved schema and
        stores it as `read_datum` for data reading.

        Raises SchemaResolutionException when the writer's and reader's
        schemas do not match.
        '''
        self._writers_schema = parsed_writer_schema

        # create the reader function from the schema
        if not hasattr(self, 'readers_schema'):
            # no reader schema: data is read exactly as written, and the
            # writer's schema also becomes the reader's schema
            resolved = parsed_writer_schema.to_json()
            self.readers_schema = parsed_writer_schema
        else:
            # despite its name, to_json() returns a python dict
            resolved = resolve(parsed_writer_schema.to_json(),
                               self.readers_schema.to_json())
        self.read_datum = get_reader(resolved)

        # schema matching
        schemas_match = FastDatumReader.match_schemas(self.writers_schema,
                                                      self.readers_schema)
        if not schemas_match:
            raise SchemaResolutionException('Schemas do not match.',
                                            self.writers_schema,
                                            self.readers_schema)
Exemple #5
0
 def skip_union(self, writers_schema, decoder):
     """
     Skip over one encoded union value.

     Reads the long branch index from `decoder`, then delegates to
     `self.skip_data` with the selected branch of `writers_schema`.

     Raises SchemaResolutionException when the index is negative or is not
     smaller than the number of branches.
     """
     index_of_schema = int(decoder.read_long())
     branch_count = len(writers_schema.schemas)
     # A negative index would not fail on its own: Python would silently
     # pick a branch counted from the end of the list, so reject it too.
     if not 0 <= index_of_schema < branch_count:
         fail_msg = "Can't access branch index %d for union with %d branches"\
                              % (index_of_schema, branch_count)
         raise SchemaResolutionException(fail_msg, writers_schema)
     return self.skip_data(writers_schema.schemas[index_of_schema], decoder)
Exemple #6
0
def resolve_fixed(writer, reader):
    '''Resolve two fixed schemas and return a shallow copy of the writer's
    schema.

    Raises SchemaResolutionException unless both name and size agree.
    '''
    compatible = (writer['name'] == reader['name']
                  and writer['size'] == reader['size'])
    if not compatible:
        raise SchemaResolutionException(
            "Schemas not compatible, fixed names or sizes don't match")
    return dict(writer.items())
Exemple #7
0
def resolve(writer, reader):
    '''Take a writer and a reader schema and return a meta schema that
    translates the writer's schema to the reader's schema.

    This handles skipping missing fields and default fills by creating
    non-standard 'types' for reader creation. These non-standard types are
    never surfaced out since they're not standard avro types but just used
    as an implementation detail for generating a write-compatible reader.

    Resolution rules, in order:
      * same type on both sides: primitives return the reader's schema,
        complex types are delegated to the matching resolve_* function, and
        anything else returns the reader's schema unchanged;
      * reader is a union and writer is not: the writer is resolved against
        the reader's branches, trying branches of the writer's own type
        first and then the remaining ones (which can only match through
        promotion); the first branch that resolves wins;
      * writer type promotable to the reader type: the writer's schema is
        returned.

    Raises SchemaResolutionException when none of the rules apply.
    '''
    writer_type = get_type(writer)
    reader_type = get_type(reader)

    if writer_type == reader_type:
        if reader_type in primitive_types:
            return reader
        if reader_type == 'array':
            return resolve_array(writer, reader)
        if reader_type == 'map':
            return resolve_map(writer, reader)
        if reader_type == 'enum':
            return resolve_enum(writer, reader)
        if reader_type == 'union':
            return resolve_union(writer, reader)
        if reader_type == "record":
            return resolve_record(writer, reader)
        if reader_type == "fixed":
            return resolve_fixed(writer, reader)
        # for named types or other types that don't match
        # just return the reader
        return reader

    # see if we've 'upgraded' to a union
    if reader_type == 'union':
        branch_types = [get_type(branch) for branch in reader]
        # Exact type matches are preferred so that a schema which resolved
        # before still resolves to the same branch.
        same_type = [
            branch for branch, branch_type in zip(reader, branch_types)
            if branch_type == writer_type
        ]
        other_type = [
            branch for branch, branch_type in zip(reader, branch_types)
            if branch_type != writer_type
        ]
        # Every candidate is tried: the first branch of the right type may
        # be a differently named record/enum/fixed, and a branch of another
        # type may still accept the writer through promotion.
        for branch in same_type + other_type:
            try:
                return resolve(writer, branch)
            except SchemaResolutionException:
                # keep trying until all reader branches are exhausted
                continue
        raise SchemaResolutionException(
            "Writer schema not present in reader union")

    if writer_type in promotable and reader_type in promotable and promotable.index(
            writer_type) < promotable.index(reader_type):
        return writer
    raise SchemaResolutionException(
        "Reader and Writer schemas are incompatible")
Exemple #8
0
    def read_enum(self, writers_schema, readers_schema, decoder):
        """
        An enum is encoded by a int, representing the zero-based position
        of the symbol in the schema.

        Returns the symbol the writer's schema lists at the decoded position.

        Raises SchemaResolutionException when the decoded position is
        negative or past the end of the writer's symbols, or when the symbol
        is not present in the reader's schema.
        """
        # read data
        index_of_symbol = decoder.read_int()
        symbol_count = len(writers_schema.symbols)
        # A negative index must be rejected explicitly: Python indexing would
        # otherwise wrap around and return a symbol from the end of the list.
        if not 0 <= index_of_symbol < symbol_count:
            fail_msg = "Can't access enum index %d for enum with %d symbols"\
                                 % (index_of_symbol, symbol_count)
            raise SchemaResolutionException(fail_msg, writers_schema,
                                            readers_schema)
        read_symbol = writers_schema.symbols[index_of_symbol]

        # schema resolution
        if read_symbol not in readers_schema.symbols:
            fail_msg = "Symbol %s not present in Reader's Schema" % read_symbol
            raise SchemaResolutionException(fail_msg, writers_schema,
                                            readers_schema)

        return read_symbol
Exemple #9
0
    def read_record(self, writers_schema, readers_schema, decoder):
        """
        A record is encoded as the concatenation of the encodings of its
        fields, in the order the writer declared them. Each field value is
        encoded per its own schema.

        Schema Resolution:
         * fields are matched by name, so their order may differ.
         * a field known to both schemas is read recursively through
             read_data with the writer's and reader's field types.
         * a field only the writer knows is consumed with skip_data and
             dropped.
         * a field only the reader knows is filled from its default value.
         * a field only the reader knows that has no default value raises
             SchemaResolutionException.

        Returns a dict mapping field names to the values read.
        """
        reader_fields = readers_schema.fields_dict
        result = {}

        # walk the data in writer order, keeping what the reader asks for
        for w_field in writers_schema.fields:
            r_field = reader_fields.get(w_field.name)
            if r_field is None:
                self.skip_data(w_field.type, decoder)
                continue
            result[w_field.name] = self.read_data(w_field.type, r_field.type,
                                                  decoder)

        # every reader field was supplied by the writer: nothing to default
        if len(reader_fields) <= len(result):
            return result

        # fill in default values
        writer_fields = writers_schema.fields_dict
        for name, r_field in reader_fields.items():
            if name in writer_fields:
                continue
            if not r_field.has_default:
                raise SchemaResolutionException(
                    'No default value for field %s' % name, writers_schema,
                    readers_schema)
            result[r_field.name] = self._read_default_value(
                r_field.type, r_field.default)
        return result
Exemple #10
0
    def read_union(self, writers_schema, readers_schema, decoder):
        """
        A union is encoded by first writing a long value indicating
        the zero-based position within the union of the schema of its value.
        The value is then encoded per the indicated schema within the union.

        Returns whatever read_data yields for the selected writer branch
        read against `readers_schema`.

        Raises SchemaResolutionException when the decoded position is
        negative or is not smaller than the number of writer branches.
        """
        # schema resolution
        index_of_schema = int(decoder.read_long())
        branch_count = len(writers_schema.schemas)
        # A negative index must be rejected explicitly: Python indexing would
        # otherwise wrap around and select a branch from the end of the list.
        if not 0 <= index_of_schema < branch_count:
            fail_msg = "Can't access branch index %d for union with %d branches"\
                                 % (index_of_schema, branch_count)
            raise SchemaResolutionException(fail_msg, writers_schema,
                                            readers_schema)
        selected_writers_schema = writers_schema.schemas[index_of_schema]

        # read data
        return self.read_data(selected_writers_schema, readers_schema, decoder)
Exemple #11
0
def resolve_union(writer, reader):
    '''Resolve a writer union against a reader union.

    Each writer branch, in writer order, is resolved against the reader
    branches in reader order; the first reader branch that resolves supplies
    the entry for that writer branch. The result is the list of those
    resolved entries.

    Raises SchemaResolutionException when some writer branch resolves
    against none of the reader branches.
    '''
    resolved_branches = []
    for writer_branch in writer:
        matched = False
        for reader_branch in reader:
            try:
                candidate = resolve(writer_branch, reader_branch)
            except SchemaResolutionException:
                # incompatible pair, move on to the next reader branch
                continue
            resolved_branches.append(candidate)
            matched = True
            break
        if not matched:
            # none of the read types matched the write type, this is an error
            raise SchemaResolutionException(
                "Schema in writer's union not present in reader's union.")
    return resolved_branches