Example #1
    def __init__(self,
                 fo,
                 schema,
                 codec='null',
                 sync_interval=1000 * SYNC_SIZE,
                 metadata=None,
                 validator=None):
        self.fo = fo
        self.schema = schema
        self.validate_fn = validate if validator is True else validator
        self.sync_marker = urandom(SYNC_SIZE)
        self.io = MemoryIO()
        self.block_count = 0
        self.metadata = metadata or {}
        self.metadata['avro.codec'] = codec
        self.metadata['avro.schema'] = json.dumps(schema)
        self.sync_interval = sync_interval

        try:
            self.block_writer = BLOCK_WRITERS[codec]
        except KeyError:
            raise ValueError('unrecognized codec: %r' % codec)

        write_header(self.fo, self.metadata, self.sync_marker)
        acquaint_schema(self.schema)
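The constructor above writes the container header as soon as the object is created, so a freshly constructed writer already has the Avro magic bytes and metadata on its output stream. A minimal usage sketch, assuming this `__init__` belongs to fastavro's `Writer` class and that `fastavro.write.Writer` is the import path (both are assumptions here):

import json
from io import BytesIO
from fastavro.write import Writer  # assumed import path for the class above

schema = {
    'name': 'Sample',
    'type': 'record',
    'fields': [{'name': 'x', 'type': 'int'}],
}

buf = BytesIO()
w = Writer(buf, schema)                      # header is written immediately
assert buf.getvalue().startswith(b'Obj')     # Avro container files open with the 'Obj' magic
assert w.metadata['avro.codec'] == 'null'    # default codec recorded in the header metadata
assert w.metadata['avro.schema'] == json.dumps(schema)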
Example #2
def prepare_fixed_decimal(data, schema):
    if not isinstance(data, decimal.Decimal):
        return data
    scale = schema['scale']
    size = schema['size']

    # based on https://github.com/apache/avro/pull/82/

    sign, digits, exp = data.as_tuple()

    if -exp > scale:
        raise ValueError('Scale provided in schema does not match the decimal')
    delta = exp + scale
    if delta > 0:
        digits = digits + (0, ) * delta

    unscaled_datum = 0
    for digit in digits:
        unscaled_datum = (unscaled_datum * 10) + digit

    # Python 2.6 support: ints there lack bit_length()
    if not hasattr(unscaled_datum, 'bit_length'):
        bits_req = len(bin(abs(unscaled_datum))) - 2
    else:
        bits_req = unscaled_datum.bit_length() + 1  # + 1 for the sign bit

    size_in_bits = size * 8
    offset_bits = size_in_bits - bits_req

    # Build a mask with 1s only in the padding bits above bits_req; it is used
    # below to sign-extend negative values to the full fixed width.
    mask = 2**size_in_bits - 1
    bit = 1
    for i in range(bits_req):
        mask ^= bit
        bit <<= 1

    if bits_req < 8:
        bytes_req = 1
    else:
        bytes_req = bits_req // 8
        if bits_req % 8 != 0:
            bytes_req += 1

    tmp = MemoryIO()

    if sign:
        # Two's complement within bits_req bits, OR'd with the padding mask so
        # the sign is extended; always emit all `size` bytes, big-endian.
        unscaled_datum = (1 << bits_req) - unscaled_datum
        unscaled_datum = mask | unscaled_datum
        for index in range(size - 1, -1, -1):
            bits_to_write = unscaled_datum >> (8 * index)
            tmp.write(mk_bits(bits_to_write & 0xff))
    else:
        # Non-negative: left-pad with zero bytes up to the fixed size, then
        # write the magnitude big-endian.
        for i in range(offset_bits // 8):
            tmp.write(mk_bits(0))
        for index in range(bytes_req - 1, -1, -1):
            bits_to_write = unscaled_datum >> (8 * index)
            tmp.write(mk_bits(bits_to_write & 0xff))

    return tmp.getvalue()
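As a worked check (with the function above in scope; the schema dict is hypothetical and only its 'scale' and 'size' keys are consulted), the result is big-endian two's complement left-padded to the fixed size, so on Python 3 it matches int.to_bytes:

import decimal

schema = {'type': 'fixed', 'size': 4, 'scale': 2, 'precision': 5}

# Decimal('3.14') has unscaled value 314, padded to 4 bytes: b'\x00\x00\x01:'
assert (prepare_fixed_decimal(decimal.Decimal('3.14'), schema)
        == (314).to_bytes(4, 'big', signed=True))
# Negative values are sign-extended across the full fixed width
assert (prepare_fixed_decimal(decimal.Decimal('-3.14'), schema)
        == (-314).to_bytes(4, 'big', signed=True))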
Example #3
def prepare_bytes_decimal(data, schema):
    if not isinstance(data, decimal.Decimal):
        return data
    scale = schema['scale']

    # based on https://github.com/apache/avro/pull/82/

    sign, digits, exp = data.as_tuple()

    if -exp > scale:
        raise AssertionError(
            'Scale provided in schema does not match the decimal')
    delta = exp + scale
    if delta > 0:
        digits = digits + (0, ) * delta

    unscaled_datum = 0
    for digit in digits:
        unscaled_datum = (unscaled_datum * 10) + digit

    # Python 2.6 support: ints there lack bit_length()
    if not hasattr(unscaled_datum, 'bit_length'):
        bits_req = len(bin(abs(unscaled_datum))) - 2
    else:
        bits_req = unscaled_datum.bit_length() + 1  # + 1 for the sign bit

    if sign:
        # Two's complement within bits_req bits
        unscaled_datum = (1 << bits_req) - unscaled_datum

    bytes_req = bits_req // 8
    # For negatives, set the sign bits above bits_req so the byte loop below
    # emits a proper big-endian two's complement value.
    padding_bits = ~((1 << bits_req) - 1) if sign else 0
    packed_bits = padding_bits | unscaled_datum

    # Round up to a whole number of bytes
    bytes_req += 1 if (bytes_req << 3) < bits_req else 0

    tmp = MemoryIO()

    for index in range(bytes_req - 1, -1, -1):
        bits_to_write = packed_bits >> (8 * index)
        tmp.write(mk_bits(bits_to_write & 0xff))

    return tmp.getvalue()
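A similar worked check for the bytes variant (function above in scope; hypothetical schema dict, of which only 'scale' is read): here the result is the minimum-length big-endian two's complement encoding, again matching int.to_bytes on Python 3:

import decimal

schema = {'type': 'bytes', 'scale': 2, 'precision': 5}

# 3.14 with scale 2 -> unscaled 314 -> two bytes b'\x01:'
assert (prepare_bytes_decimal(decimal.Decimal('3.14'), schema)
        == (314).to_bytes(2, 'big', signed=True))
# -3.14 -> unscaled -314 -> two bytes b'\xfe\xc6'
assert (prepare_bytes_decimal(decimal.Decimal('-3.14'), schema)
        == (-314).to_bytes(2, 'big', signed=True))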
Example #4
def snappy_read_block(fo):
    """Read block in "snappy" codec."""
    length = read_long(fo, None)
    # The stored block length includes the 4-byte CRC that trails the
    # compressed payload.
    data = fo.read(length - 4)
    fo.read(4)  # CRC of the uncompressed data; not verified here
    return MemoryIO(snappy.decompress(data))
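For context, a sketch of what the matching write side would look like: the Avro spec frames a snappy-coded block as the compressed payload followed by the big-endian CRC32 of the uncompressed data, which is exactly the trailing 4 bytes the reader above skips. This is not the library's own code; write_long is assumed to be the counterpart of read_long, and snappy.compress comes from the python-snappy package.

from struct import pack
from zlib import crc32

import snappy


def snappy_write_block(fo, block_bytes):
    data = snappy.compress(block_bytes)
    write_long(fo, len(data) + 4)   # stored length covers payload + CRC
    fo.write(data)
    fo.write(pack('>I', crc32(block_bytes) & 0xffffffff))  # CRC of uncompressed data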
Example #5
def deflate_read_block(fo):
    """Read block in "deflate" codec."""
    data = read_bytes(fo, None)
    # -15 is the log of the window size; negative indicates "raw" (no
    # zlib headers) decompression.  See zlib.h.
    return MemoryIO(decompress(data, -15))
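The -15 window argument can be sanity-checked in isolation: a raw deflate stream produced by a negative-wbits compressor round-trips through the same decompress call the function above makes.

from zlib import DEFLATED, compressobj, decompress

payload = b'avro block bytes' * 10
c = compressobj(9, DEFLATED, -15)        # negative wbits -> raw deflate, no zlib header
raw = c.compress(payload) + c.flush()
assert decompress(raw, -15) == payload   # same call as deflate_read_block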
Example #6
def writer(fo,
           schema,
           records,
           codec='null',
           sync_interval=1000 * SYNC_SIZE,
           metadata=None):
    """Write records to fo (stream) according to schema

    Parameters
    ----------
    fo: file-like
        Output stream
    schema: dict
        Writer schema
    records: iterable
        Records to write
    codec: string, optional
        Compression codec, can be 'null', 'deflate' or 'snappy' (if installed)
    sync_interval: int, optional
        Size of sync interval
    metadata: dict, optional
        Header metadata


    Example
    -------

    >>> from fastavro import writer

    >>> schema = {
    ...     'doc': 'A weather reading.',
    ...     'name': 'Weather',
    ...     'namespace': 'test',
    ...     'type': 'record',
    ...     'fields': [
    ...         {'name': 'station', 'type': 'string'},
    ...         {'name': 'time', 'type': 'long'},
    ...         {'name': 'temp', 'type': 'int'},
    ...     ],
    ... }

    >>> records = [
    ...     {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},
    ...     {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
    ...     {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
    ...     {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
    ... ]

    >>> with open('weather.avro', 'wb') as out:
    ...     writer(out, schema, records)
    """
    sync_marker = urandom(SYNC_SIZE)
    io = MemoryIO()
    block_count = 0
    metadata = metadata or {}
    metadata['avro.codec'] = codec
    metadata['avro.schema'] = json.dumps(schema)

    try:
        block_writer = BLOCK_WRITERS[codec]
    except KeyError:
        raise ValueError('unrecognized codec: %r' % codec)

    def dump():
        write_long(fo, block_count)
        block_writer(fo, io.getvalue())
        fo.write(sync_marker)
        io.truncate(0)
        io.seek(0, SEEK_SET)

    write_header(fo, metadata, sync_marker)
    acquaint_schema(schema)

    for record in records:
        write_data(io, record, schema)
        block_count += 1
        if io.tell() >= sync_interval:
            dump()
            block_count = 0

    if io.tell() or block_count > 0:
        dump()

    fo.flush()
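The file produced by the docstring example can be read back with fastavro's public reader; a short usage sketch:

from fastavro import reader

with open('weather.avro', 'rb') as fo:
    for record in reader(fo):
        print(record)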