class Writer(object):
    def __init__(self, fo, schema, codec='null',
                 sync_interval=1000 * SYNC_SIZE, metadata=None,
                 validator=None):
        self.fo = fo
        self.schema = schema
        self.validate_fn = validate if validator is True else validator
        self.sync_marker = urandom(SYNC_SIZE)
        self.io = MemoryIO()
        self.block_count = 0
        self.metadata = metadata or {}
        self.metadata['avro.codec'] = codec
        self.metadata['avro.schema'] = json.dumps(schema)
        self.sync_interval = sync_interval
        try:
            self.block_writer = BLOCK_WRITERS[codec]
        except KeyError:
            raise ValueError('unrecognized codec: %r' % codec)
        write_header(self.fo, self.metadata, self.sync_marker)
        acquaint_schema(self.schema)

    def dump(self):
        write_long(self.fo, self.block_count)
        self.block_writer(self.fo, self.io.getvalue())
        self.fo.write(self.sync_marker)
        self.io.truncate(0)
        self.io.seek(0, SEEK_SET)
        self.block_count = 0

    def write(self, record):
        if self.validate_fn:
            self.validate_fn(record, self.schema)
        write_data(self.io, record, self.schema)
        self.block_count += 1
        if self.io.tell() >= self.sync_interval:
            self.dump()

    def flush(self):
        if self.io.tell() or self.block_count > 0:
            self.dump()
        self.fo.flush()
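A minimal usage sketch for the class-based interface above; the schema, record values, and output file name are placeholders taken from the writer() docstring example below, and validator=True enables per-record validation before encoding. This is illustrative, not part of the module:

# Illustrative only: incremental writing with the Writer class.
schema = {
    'name': 'Weather',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': 'station', 'type': 'string'},
        {'name': 'time', 'type': 'long'},
        {'name': 'temp', 'type': 'int'},
    ],
}

with open('weather.avro', 'wb') as out:
    w = Writer(out, schema, codec='null', validator=True)
    # Each write() encodes into an in-memory block; the block is flushed to
    # the stream once it reaches sync_interval bytes.
    w.write({'station': '011990-99999', 'time': 1433269388, 'temp': 0})
    w.write({'station': '012650-99999', 'time': 1433275478, 'temp': 111})
    # flush() emits any partially filled block and flushes the stream.
    w.flush()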
def writer(fo, schema, records, codec='null',
           sync_interval=1000 * SYNC_SIZE, metadata=None):
    """Write records to fo (stream) according to schema

    Parameters
    ----------
    fo: file-like
        Output stream
    schema: dict
        Writer schema
    records: iterable
        Records to write
    codec: string, optional
        Compression codec, can be 'null', 'deflate' or 'snappy' (if installed)
    sync_interval: int, optional
        Size of sync interval
    metadata: dict, optional
        Header metadata

    Example
    -------
    >>> from fastavro import writer
    >>> schema = {
    ...     'doc': 'A weather reading.',
    ...     'name': 'Weather',
    ...     'namespace': 'test',
    ...     'type': 'record',
    ...     'fields': [
    ...         {'name': 'station', 'type': 'string'},
    ...         {'name': 'time', 'type': 'long'},
    ...         {'name': 'temp', 'type': 'int'},
    ...     ],
    ... }
    >>> records = [
    ...     {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},
    ...     {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
    ...     {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
    ...     {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
    ... ]
    >>> with open('weather.avro', 'wb') as out:
    ...     writer(out, schema, records)
    """
    sync_marker = urandom(SYNC_SIZE)
    io = MemoryIO()
    block_count = 0
    metadata = metadata or {}
    metadata['avro.codec'] = codec
    metadata['avro.schema'] = json.dumps(schema)

    try:
        block_writer = BLOCK_WRITERS[codec]
    except KeyError:
        raise ValueError('unrecognized codec: %r' % codec)

    def dump():
        write_long(fo, block_count)
        block_writer(fo, io.getvalue())
        fo.write(sync_marker)
        io.truncate(0)
        io.seek(0, SEEK_SET)

    write_header(fo, metadata, sync_marker)
    acquaint_schema(schema)
    for record in records:
        write_data(io, record, schema)
        block_count += 1
        if io.tell() >= sync_interval:
            dump()
            block_count = 0
    if io.tell() or block_count > 0:
        dump()
    fo.flush()
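The codec parameter can be combined with the docstring example above to produce compressed blocks; a small sketch assuming the same schema and records, with a placeholder output file name:

# Illustrative only: deflate-compressed blocks instead of the default
# 'null' codec; schema and records are the ones defined above.
with open('weather_deflate.avro', 'wb') as out:
    writer(out, schema, records, codec='deflate')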
def deflate_read_block(fo):
    """Read block in "deflate" codec."""
    data = read_bytes(fo, None)
    # -15 is the log of the window size; negative indicates "raw" (no
    # zlib headers) decompression. See zlib.h.
    return MemoryIO(decompress(data, -15))
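For context, a hedged sketch of the write-side counterpart implied by this read path: block bytes compressed as raw deflate (no zlib header or checksum) so that decompress(data, -15) can undo them. The name deflate_write_block and the use of zlib.compressobj here are illustrative assumptions, not necessarily the library's actual implementation; write_long is the module helper used above.

import zlib

def deflate_write_block(fo, block_bytes):
    """Illustrative sketch: write a block in the "deflate" codec."""
    # A negative window size yields raw deflate output (no zlib header or
    # trailing checksum), matching decompress(data, -15) on the read side.
    compressor = zlib.compressobj(9, zlib.DEFLATED, -15)
    data = compressor.compress(block_bytes) + compressor.flush()
    # Avro stores compressed blocks as length-prefixed bytes.
    write_long(fo, len(data))
    fo.write(data)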