Example no. 1
def write_header(fo, metadata, sync_marker):
    """Write the Avro container-file header: the magic bytes, the
    metadata map (values encoded as bytes) and the sync marker."""
    header = {
        'magic': MAGIC,
        'meta': dict([(key, utob(value)) for key, value in
                      iteritems(metadata)]),
        'sync': sync_marker
    }
    write_data(fo, header, HEADER_SCHEMA)
Example no. 2
def write_header(fo, metadata, sync_marker):
    """Write the Avro container-file header: the magic bytes, the
    metadata map (values encoded as bytes) and the sync marker."""
    header = {
        'magic': MAGIC,
        'meta': dict([(key, utob(value)) for key, value in
                      iteritems(metadata)]),
        'sync': sync_marker
    }
    write_data(fo, header, HEADER_SCHEMA)
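For context, a minimal driver for this helper. It assumes write_header and its module-level dependencies (MAGIC, HEADER_SCHEMA, write_data, utob) are in scope; the metadata keys shown are the standard container-file entries that iter_avro reads back further below.

import io
import json

# Illustrative schema; any valid Avro schema works here.
schema = {'type': 'record', 'name': 'Test',
          'fields': [{'name': 'field', 'type': 'string'}]}

metadata = {
    'avro.schema': json.dumps(schema),  # read back by iter_avro below
    'avro.codec': 'null',
}
sync_marker = b'\x00' * 16  # normally 16 random bytes

fo = io.BytesIO()
write_header(fo, metadata, sync_marker)  # assumes the helper above is importable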
Example no. 3
def read_record(fo, writer_schema, reader_schema=None):
    """A record is encoded by encoding the values of its fields in the order
    that they are declared. In other words, a record is encoded as just the
    concatenation of the encodings of its fields.  Field values are encoded per
    their schema.

    Schema Resolution:
     * the ordering of fields may be different: fields are matched by name.
     * schemas for fields with the same name in both records are resolved
         recursively.
     * if the writer's record contains a field with a name not present in the
         reader's record, the writer's value for that field is ignored.
     * if the reader's record schema has a field that contains a default value,
         and writer's schema does not have a field with the same name, then the
         reader should use the default value from its field.
     * if the reader's record schema has a field with no default value, and
         writer's schema does not have a field with the same name, then the
         field's value is unset.
    """
    record = {}
    if reader_schema is None:
        for field in writer_schema['fields']:
            record[field['name']] = read_data(fo, field['type'])
    else:
        readers_field_dict = \
            dict((f['name'], f) for f in reader_schema['fields'])
        for field in writer_schema['fields']:
            readers_field = readers_field_dict.get(field['name'])
            if readers_field:
                record[field['name']] = read_data(fo,
                                                  field['type'],
                                                  readers_field['type'])
            else:
                # no matching reader field: decode and discard the value
                # (a true skip that avoids full decoding would be cheaper)
                read_data(fo, field['type'], field['type'])

        # fill in default values
        if len(readers_field_dict) > len(record):
            writer_fields = [f['name'] for f in writer_schema['fields']]
            for field_name, field in iteritems(readers_field_dict):
                if field_name not in writer_fields:
                    # membership test, not truthiness: falsy defaults
                    # (0, False and '') must still be honored
                    if 'default' in field:
                        record[field['name']] = field['default']
                    else:
                        msg = 'No default value for %s' % field['name']
                        raise SchemaResolutionError(msg)

    return record
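To make these resolution rules concrete, here is a small illustrative pair of schemas (the record and field names are made up for this sketch, not taken from the module):

writer_schema = {
    'type': 'record', 'name': 'Person',
    'fields': [{'name': 'name', 'type': 'string'}],
}

# The reader additionally expects 'age'; the default keeps old data readable.
reader_schema = {
    'type': 'record', 'name': 'Person',
    'fields': [
        {'name': 'name', 'type': 'string'},
        {'name': 'age', 'type': 'int', 'default': 0},
    ],
}

# With a stream positioned at a record written under writer_schema,
# read_record(fo, writer_schema, reader_schema) yields, for example,
# {'name': 'Alice', 'age': 0}: 'name' is matched by name and 'age' is
# filled from the reader's default.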
Example no. 4
def read_record(fo, writer_schema, reader_schema=None):
    """A record is encoded by encoding the values of its fields in the order
    that they are declared. In other words, a record is encoded as just the
    concatenation of the encodings of its fields.  Field values are encoded per
    their schema.

    Schema Resolution:
     * the ordering of fields may be different: fields are matched by name.
     * schemas for fields with the same name in both records are resolved
         recursively.
     * if the writer's record contains a field with a name not present in the
         reader's record, the writer's value for that field is ignored.
     * if the reader's record schema has a field that contains a default value,
         and writer's schema does not have a field with the same name, then the
         reader should use the default value from its field.
     * if the reader's record schema has a field with no default value, and
         writer's schema does not have a field with the same name, then the
         field's value is unset.
    """
    record = {}
    if reader_schema is None:
        for field in writer_schema['fields']:
            record[field['name']] = read_data(fo, field['type'])
    else:
        readers_field_dict = \
            dict((f['name'], f) for f in reader_schema['fields'])
        for field in writer_schema['fields']:
            readers_field = readers_field_dict.get(field['name'])
            if readers_field:
                record[field['name']] = read_data(fo,
                                                  field['type'],
                                                  readers_field['type'])
            else:
                # no matching reader field: decode and discard the value
                # (a true skip that avoids full decoding would be cheaper)
                read_data(fo, field['type'], field['type'])

        # fill in default values
        if len(readers_field_dict) > len(record):
            writer_fields = [f['name'] for f in writer_schema['fields']]
            for field_name, field in iteritems(readers_field_dict):
                if field_name not in writer_fields:
                    if 'default' in field:
                        record[field['name']] = field['default']
                    else:
                        msg = 'No default value for %s' % field['name']
                        raise SchemaResolutionError(msg)

    return record
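The membership test on 'default' here is deliberate: a truthiness check would wrongly treat falsy defaults as missing. A standalone illustration:

field = {'name': 'age', 'type': 'int', 'default': 0}

# Truthiness check: 0 is falsy, so a declared default would look absent.
assert not bool(field.get('default'))

# Membership check: correctly detects that a default is declared.
assert 'default' in field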
Example no. 5
def write_map(fo, datum, schema):
    """Maps are encoded as a series of blocks.

    Each block consists of a long count value, followed by that many key/value
    pairs.  A block with count zero indicates the end of the map.  Each item is
    encoded per the map's value schema.

    If a block's count is negative, then the count is followed immediately by a
    long block size, indicating the number of bytes in the block. The actual
    count in this case is the absolute value of the count written."""
    if len(datum) > 0:
        write_long(fo, len(datum))
        vtype = schema['values']
        for key, val in iteritems(datum):
            write_utf8(fo, key)
            write_data(fo, val, vtype)
    write_long(fo, 0)
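The long counts in this format use Avro's zig-zag/varint encoding. A standalone sketch of that encoding (not the module's own write_long, whose body is not shown in this excerpt):

def zigzag_long(n):
    """Encode an Avro long: zig-zag, then base-128 varint."""
    n = (n << 1) ^ (n >> 63)  # zig-zag maps signed to unsigned
    out = bytearray()
    while (n & ~0x7F) != 0:
        out.append((n & 0x7F) | 0x80)  # low 7 bits, continuation bit set
        n >>= 7
    out.append(n)
    return bytes(out)

# The count 1 opening a single-entry map block encodes as b'\x02';
# the 0 that terminates the map encodes as b'\x00'.
assert zigzag_long(1) == b'\x02'
assert zigzag_long(0) == b'\x00'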
Example no. 6
def write_map(fo, datum, schema):
    """Maps are encoded as a series of blocks.

    Each block consists of a long count value, followed by that many key/value
    pairs.  A block with count zero indicates the end of the map.  Each item is
    encoded per the map's value schema.

    If a block's count is negative, then the count is followed immediately by a
    long block size, indicating the number of bytes in the block. The actual
    count in this case is the absolute value of the count written."""
    if len(datum) > 0:
        write_long(fo, len(datum))
        vtype = schema['values']
        for key, val in iteritems(datum):
            write_utf8(fo, key)
            write_data(fo, val, vtype)
    write_long(fo, 0)
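The read side has to honor the negative-count form described in the docstring. A hedged sketch of a matching reader, assuming read_long, read_utf8 and read_data helpers with the obvious signatures (none of which are shown in this excerpt):

def read_map_sketch(fo, writer_schema, reader_schema=None):
    result = {}
    block_count = read_long(fo)
    while block_count != 0:
        if block_count < 0:
            block_count = -block_count
            read_long(fo)  # block size in bytes; unused when decoding eagerly
        for _ in range(block_count):
            key = read_utf8(fo)
            result[key] = read_data(fo, writer_schema['values'],
                                    reader_schema and reader_schema['values'])
        block_count = read_long(fo)
    return result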
Example no. 7
    def __init__(self, fo, reader_schema=None):
        """Creates a new iterator

        Parameters
        ----------
        fo: file-like
            Input stream
        reader_schema: dict, optional
            Reader schema

        Example
        -------
        >>> with open('some-file.avro', 'rb') as fo:
        ...     avro = iter_avro(fo)
        ...     schema = avro.schema
        ...     for record in avro:
        ...         process_record(record)
        """
        self.fo = fo
        try:
            self._header = read_data(fo, HEADER_SCHEMA)
        except StopIteration:
            raise ValueError('cannot read header - is it an avro file?')

        # `meta` values are bytes. So, the actual decoding has to be external.
        self.metadata = \
            dict((k, btou(v)) for k, v in iteritems(self._header['meta']))

        self.schema = self.writer_schema = \
            json.loads(self.metadata['avro.schema'])
        self.codec = self.metadata.get('avro.codec', 'null')
        self.reader_schema = reader_schema

        acquaint_schema(self.writer_schema, READERS)
        if reader_schema:
            populate_schema_defs(reader_schema, SCHEMA_DEFS)
        self._records = _iter_avro(fo,
                                   self._header,
                                   self.codec,
                                   self.writer_schema,
                                   reader_schema)
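A short sketch of the attributes this constructor exposes (the file name is a placeholder):

with open('some-file.avro', 'rb') as fo:
    avro = iter_avro(fo)
    print(avro.codec)            # 'null' unless 'avro.codec' was set
    print(list(avro.metadata))   # decoded 'meta' keys, str rather than bytes
    print(avro.schema)           # the parsed writer schema, as a dict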
Example no. 8
    def __init__(self, fo, reader_schema=None):
        """Creates a new iterator

        Parameters
        ----------
        fo: file-like
            Input stream
        reader_schema: dict, optional
            Reader schema

        Example
        -------
        >>> with open('some-file.avro', 'rb') as fo:
        ...     avro = iter_avro(fo)
        ...     schema = avro.schema
        ...     for record in avro:
        ...         process_record(record)
        """
        self.fo = fo
        try:
            self._header = read_data(fo, HEADER_SCHEMA)
        except StopIteration:
            raise ValueError('cannot read header - is it an avro file?')

        # `meta` values are bytes. So, the actual decoding has to be external.
        self.metadata = \
            dict((k, btou(v)) for k, v in iteritems(self._header['meta']))

        self.schema = self.writer_schema = \
            json.loads(self.metadata['avro.schema'])
        self.codec = self.metadata.get('avro.codec', 'null')
        self.reader_schema = reader_schema

        acquaint_schema(self.writer_schema, READERS)
        if reader_schema:
            populate_schema_defs(reader_schema)
        self._records = _iter_avro(fo,
                                   self._header,
                                   self.codec,
                                   self.writer_schema,
                                   reader_schema)
Example no. 9
    def __init__(self, fo, reader_schema=None):
        """Creates a new iterator over the records in an Avro file."""
        self.fo = fo
        try:
            self._header = read_data(fo, HEADER_SCHEMA)
        except StopIteration:
            raise ValueError('cannot read header - is it an avro file?')

        # `meta` values are bytes. So, the actual decoding has to be external.
        self.metadata = \
            dict([(k, btou(v)) for k, v in iteritems(self._header['meta'])])

        self.schema = self.writer_schema = \
            json.loads(self.metadata['avro.schema'])
        self.codec = self.metadata.get('avro.codec', 'null')
        self.reader_schema = reader_schema

        acquaint_schema(self.writer_schema, READERS)
        if reader_schema:
            populate_schema_defs(reader_schema, SCHEMA_DEFS)
        self._records = _iter_avro(fo,
                                   self._header,
                                   self.codec,
                                   self.writer_schema,
                                   reader_schema)
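Finally, a sketch of driving the constructor with a reader schema so that each record comes back resolved against it. The file name and the fields are illustrative, and process_record is a placeholder, as in the docstring above:

reader_schema = {
    'type': 'record', 'name': 'Person',
    'fields': [
        {'name': 'name', 'type': 'string'},
        {'name': 'age', 'type': 'int', 'default': 0},
    ],
}

with open('some-file.avro', 'rb') as fo:
    avro = iter_avro(fo, reader_schema=reader_schema)
    for record in avro:
        # 'age' falls back to 0 whenever the writer's schema lacked it
        process_record(record)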