Example #1
0
def read_bytes_decimal(data, writer_schema=None, reader_schema=None):
    """
    Decode the bytes in `data` into a `Decimal` described by `writer_schema`.

    The byte values are folded together most-significant first.  When the
    top bit of the leading byte is set the number is treated as negative
    (two's complement).  The resulting integer is then scaled by
    ``10 ** -scale`` inside a decimal context whose precision is the
    schema's ``precision``.

    based on https://github.com/apache/avro/pull/82/

    `writer_schema` must provide the 'scale' and 'precision' keys.
    `reader_schema` is accepted but not used by this function.
    """
    scale = writer_schema['scale']
    precision = writer_schema['precision']

    byte_count = len(data)

    # NOTE(review): str2ints / fstint are defined elsewhere; presumably they
    # yield the per-byte integer values and the first byte's value - confirm.
    byte_values = str2ints(data)

    negative = ((fstint(data) >> 7) & 1) == 1
    if negative:
        # Clear the sign bit so the bytes accumulate as a plain non-negative
        # number; the sign bit's (negative) weight is added back afterwards.
        byte_values = [byte_values[0] ^ 0x80] + byte_values[1:]

    unscaled = 0
    for index in xrange(byte_count):
        unscaled = (unscaled << 8) + byte_values[index]

    if negative:
        unscaled += pow(-2, (byte_count * 8) - 1)

    with localcontext() as ctx:
        ctx.prec = precision
        return Decimal(unscaled).scaleb(-scale)
Example #2
0
def _read_decimal(data, size, writer_schema):
    """
    Decode the first `size` bytes of `data` into a `Decimal`.

    The byte values are folded together most-significant first.  When the
    top bit of the leading byte is set the number is treated as negative
    (two's complement over ``size * 8`` bits).  The resulting integer is
    then scaled by ``10 ** -scale`` inside a decimal context whose
    precision is the schema's ``precision``.

    based on https://github.com/apache/avro/pull/82/

    `writer_schema` must provide the 'scale' and 'precision' keys.
    """
    scale = writer_schema['scale']
    precision = writer_schema['precision']

    # NOTE(review): str2ints / fstint are defined elsewhere; presumably they
    # yield the per-byte integer values and the first byte's value - confirm.
    byte_values = str2ints(data)

    negative = ((fstint(data) >> 7) & 1) == 1
    if negative:
        # Clear the sign bit so the bytes accumulate as a plain non-negative
        # number; the sign bit's (negative) weight is added back afterwards.
        byte_values = [byte_values[0] ^ 0x80] + byte_values[1:]

    unscaled = 0
    for index in xrange(size):
        unscaled = (unscaled << 8) + byte_values[index]

    if negative:
        unscaled += pow(-2, (size * 8) - 1)

    with localcontext() as ctx:
        ctx.prec = precision
        return Decimal(unscaled).scaleb(-scale)
Example #3
0
def read_map(fo, writer_schema, reader_schema=None):
    """Decode a block-encoded map from `fo` and return it as a dict.

    The encoding is a sequence of blocks.  Every block opens with a long
    count followed by that many key/value pairs; a count of zero terminates
    the map.  Keys are read with `read_utf8` and values with `read_data`
    using the schema's 'values' entry.

    A negative count means the count is immediately followed by a long
    giving the block's size in bytes; the real number of pairs is the
    absolute value of the count.  The byte size is read and discarded.

    When `reader_schema` is truthy its 'values' entry is forwarded to
    `read_data` alongside the writer's.
    """
    use_reader = bool(reader_schema)
    result = {}

    while True:
        remaining = read_long(fo)
        if remaining == 0:
            break

        if remaining < 0:
            remaining = -remaining
            # The block's byte size follows a negative count; it is unused.
            read_long(fo)

        for _ in xrange(remaining):
            key = read_utf8(fo)
            if use_reader:
                value = read_data(
                    fo, writer_schema['values'], reader_schema['values'])
            else:
                value = read_data(fo, writer_schema['values'])
            result[key] = value

    return result
Example #4
0
def read_array(fo, writer_schema, reader_schema=None):
    """Decode a block-encoded array from `fo` and return it as a list.

    The encoding is a sequence of blocks.  Every block opens with a long
    count followed by that many items; a count of zero terminates the
    array.  Items are read with `read_data` using the schema's 'items'
    entry.

    A negative count means the count is immediately followed by a long
    giving the block's size in bytes; the real number of items is the
    absolute value of the count.  The byte size is read and discarded.

    When `reader_schema` is truthy its 'items' entry is forwarded to
    `read_data` alongside the writer's.
    """
    use_reader = bool(reader_schema)
    result = []

    while True:
        remaining = read_long(fo)
        if remaining == 0:
            break

        if remaining < 0:
            remaining = -remaining
            # The block's byte size follows a negative count; it is unused.
            read_long(fo)

        for _ in xrange(remaining):
            if use_reader:
                item = read_data(
                    fo, writer_schema['items'], reader_schema['items'])
            else:
                item = read_data(fo, writer_schema['items'])
            result.append(item)

    return result
Example #5
0
def read_map(fo, writer_schema, reader_schema=None):
    """Decode a block-encoded map from `fo` and return it as a dict.

    The encoding is a sequence of blocks.  Every block opens with a long
    count followed by that many key/value pairs; a count of zero terminates
    the map.  Keys are read with `read_utf8` and values with `read_data`
    using the schema's 'values' entry.

    A negative count means the count is immediately followed by a long
    giving the block's size in bytes; the real number of pairs is the
    absolute value of the count.  The byte size is read and discarded.

    When `reader_schema` is truthy its 'values' entry is forwarded to
    `read_data` alongside the writer's.
    """
    use_reader = bool(reader_schema)
    result = {}

    while True:
        remaining = read_long(fo)
        if remaining == 0:
            break

        if remaining < 0:
            remaining = -remaining
            # The block's byte size follows a negative count; it is unused.
            read_long(fo)

        for _ in xrange(remaining):
            key = read_utf8(fo)
            if use_reader:
                value = read_data(
                    fo, writer_schema['values'], reader_schema['values'])
            else:
                value = read_data(fo, writer_schema['values'])
            result[key] = value

    return result
Example #6
0
def read_array(fo, writer_schema, reader_schema=None):
    """Decode a block-encoded array from `fo` and return it as a list.

    The encoding is a sequence of blocks.  Every block opens with a long
    count followed by that many items; a count of zero terminates the
    array.  Items are read with `read_data` using the schema's 'items'
    entry.

    A negative count means the count is immediately followed by a long
    giving the block's size in bytes; the real number of items is the
    absolute value of the count.  The byte size is read and discarded.

    When `reader_schema` is truthy its 'items' entry is forwarded to
    `read_data` alongside the writer's.
    """
    use_reader = bool(reader_schema)
    result = []

    while True:
        remaining = read_long(fo)
        if remaining == 0:
            break

        if remaining < 0:
            remaining = -remaining
            # The block's byte size follows a negative count; it is unused.
            read_long(fo)

        for _ in xrange(remaining):
            if use_reader:
                item = read_data(
                    fo, writer_schema['items'], reader_schema['items'])
            else:
                item = read_data(fo, writer_schema['items'])
            result.append(item)

    return result
Example #7
0
def _iter_avro(fo, header, codec, writer_schema, reader_schema):
    """Return iterator over avro records.

    This is a generator: nothing below runs until the first item is
    requested.  For every block it reads a long record count from `fo`,
    obtains the block's contents through the reader registered for `codec`
    in `BLOCK_READERS`, yields that many records decoded with `read_data`,
    and then calls `skip_sync` with the header's 'sync' value.

    Raises ValueError (on first iteration) if `codec` has no entry in
    `BLOCK_READERS`.
    """
    sync_marker = header['sync']
    # Value in schema is bytes

    read_block = BLOCK_READERS.get(codec)
    if not read_block:
        raise ValueError('Unrecognized codec: %r' % codec)

    while True:
        try:
            block_count = read_long(fo, None)
        except StopIteration:
            # NOTE(review): this loop has no other exit, so read_long
            # presumably signals end of input by raising StopIteration -
            # confirm.  Under PEP 479 (always on since Python 3.7) a
            # StopIteration escaping a generator body is converted into
            # RuntimeError, so finish the generator explicitly instead.
            return
        block_fo = read_block(fo)

        for _ in xrange(block_count):
            yield read_data(block_fo, writer_schema, reader_schema)

        skip_sync(fo, sync_marker)
Example #8
0
def _iter_avro(fo, header, codec, writer_schema, reader_schema):
    """Return iterator over avro records.

    This is a generator: nothing below runs until the first item is
    requested.  For every block it reads a long record count from `fo`,
    obtains the block's contents through the reader registered for `codec`
    in `BLOCK_READERS`, yields that many records decoded with `read_data`,
    and then calls `skip_sync` with the header's 'sync' value.

    Raises ValueError (on first iteration) if `codec` has no entry in
    `BLOCK_READERS`.
    """
    sync_marker = header['sync']
    # Value in schema is bytes

    read_block = BLOCK_READERS.get(codec)
    if not read_block:
        raise ValueError('Unrecognized codec: %r' % codec)

    while True:
        try:
            block_count = read_long(fo, None)
        except StopIteration:
            # NOTE(review): this loop has no other exit, so read_long
            # presumably signals end of input by raising StopIteration -
            # confirm.  Under PEP 479 (always on since Python 3.7) a
            # StopIteration escaping a generator body is converted into
            # RuntimeError, so finish the generator explicitly instead.
            return
        block_fo = read_block(fo)

        for _ in xrange(block_count):
            yield read_data(block_fo, writer_schema, reader_schema)

        skip_sync(fo, sync_marker)