Example #1
0
    def __init__(self,
                 fo,
                 schema,
                 codec='null',
                 sync_interval=1000 * SYNC_SIZE,
                 metadata=None,
                 validator=None):
        self.fo = fo
        self.schema = schema
        self.validate_fn = validate if validator is True else validator
        self.sync_marker = urandom(SYNC_SIZE)
        self.io = MemoryIO()
        self.block_count = 0
        self.metadata = metadata or {}
        self.metadata['avro.codec'] = codec
        self.metadata['avro.schema'] = json.dumps(schema)
        self.sync_interval = sync_interval

        try:
            self.block_writer = BLOCK_WRITERS[codec]
        except KeyError:
            raise ValueError('unrecognized codec: %r' % codec)

        write_header(self.fo, self.metadata, self.sync_marker)
        acquaint_schema(self.schema)
Example #2
0
def writer(fo,
           schema,
           records,
           codec='null',
           sync_interval=1000 * SYNC_SIZE,
           metadata=None):
    sync_marker = urandom(SYNC_SIZE)
    io = MemoryIO()
    block_count = 0
    metadata = metadata or {}
    metadata['avro.codec'] = codec
    metadata['avro.schema'] = json.dumps(schema)

    try:
        block_writer = BLOCK_WRITERS[codec]
    except KeyError:
        raise ValueError('Unrecognized codec: {0!r}'.format(codec))

    def dump():
        write_long(fo, block_count)
        block_writer(fo, io.getvalue())
        fo.write(sync_marker)
        io.truncate(0)
        io.seek(0, SEEK_SET)

    write_header(fo, metadata, sync_marker)
    acquaint_schema(schema)

    for record in records:
        write_data(io, record, schema)
        block_count += 1
        if io.tell() >= sync_interval:
            dump()
            block_count = 0

    if io.tell() or block_count > 0:
        dump()

    fo.flush()
Example #3
0
class Writer(object):
    def __init__(self,
                 fo,
                 schema,
                 codec='null',
                 sync_interval=1000 * SYNC_SIZE,
                 metadata=None,
                 validator=None):
        self.fo = fo
        self.schema = schema
        self.validate_fn = validate if validator is True else validator
        self.sync_marker = urandom(SYNC_SIZE)
        self.io = MemoryIO()
        self.block_count = 0
        self.metadata = metadata or {}
        self.metadata['avro.codec'] = codec
        self.metadata['avro.schema'] = json.dumps(schema)
        self.sync_interval = sync_interval

        try:
            self.block_writer = BLOCK_WRITERS[codec]
        except KeyError:
            raise ValueError('unrecognized codec: %r' % codec)

        write_header(self.fo, self.metadata, self.sync_marker)
        acquaint_schema(self.schema)

    def dump(self):
        write_long(self.fo, self.block_count)
        self.block_writer(self.fo, self.io.getvalue())
        self.fo.write(self.sync_marker)
        self.io.truncate(0)
        self.io.seek(0, SEEK_SET)
        self.block_count = 0

    def write(self, record):
        if self.validate_fn:
            self.validate_fn(record, self.schema)
        write_data(self.io, record, self.schema)
        self.block_count += 1
        if self.io.tell() >= self.sync_interval:
            self.dump()

    def flush(self):
        if self.io.tell() or self.block_count > 0:
            self.dump()
        self.fo.flush()
Example #4
0
def writer(fo,
           schema,
           records,
           codec='null',
           sync_interval=1000 * SYNC_SIZE,
           metadata=None):
    """Write records to fo (stream) according to schema

    Paramaters
    ----------
    fo: file like
        Output stream
    records: iterable
        Records to write
    codec: string, optional
        Compression codec, can be 'null', 'deflate' or 'snappy' (if installed)
    sync_interval: int, optional
        Size of sync interval
    metadata: dict, optional
        Header metadata


    Example
    -------

    >>> from fastavro import writer

    >>> schema = {
    >>>     'doc': 'A weather reading.',
    >>>     'name': 'Weather',
    >>>     'namespace': 'test',
    >>>     'type': 'record',
    >>>     'fields': [
    >>>         {'name': 'station', 'type': 'string'},
    >>>         {'name': 'time', 'type': 'long'},
    >>>         {'name': 'temp', 'type': 'int'},
    >>>     ],
    >>> }

    >>> records = [
    >>>     {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},
    >>>     {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
    >>>     {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
    >>>     {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
    >>> ]

    >>> with open('weather.avro', 'wb') as out:
    >>>     writer(out, schema, records)
    """
    sync_marker = urandom(SYNC_SIZE)
    io = MemoryIO()
    block_count = 0
    metadata = metadata or {}
    metadata['avro.codec'] = codec
    metadata['avro.schema'] = json.dumps(schema)

    try:
        block_writer = BLOCK_WRITERS[codec]
    except KeyError:
        raise ValueError('unrecognized codec: %r' % codec)

    def dump():
        write_long(fo, block_count)
        block_writer(fo, io.getvalue())
        fo.write(sync_marker)
        io.truncate(0)
        io.seek(0, SEEK_SET)

    write_header(fo, metadata, sync_marker)
    acquaint_schema(schema)

    for record in records:
        write_data(io, record, schema)
        block_count += 1
        if io.tell() >= sync_interval:
            dump()
            block_count = 0

    if io.tell() or block_count > 0:
        dump()

    fo.flush()
Example #5
0
def deflate_read_block(fo):
    """Read block in "deflate" codec."""
    data = read_bytes(fo, None)
    # -15 is the log of the window size; negative indicates "raw" (no
    # zlib headers) decompression.  See zlib.h.
    return MemoryIO(decompress(data, -15))