import fastavro
from fastavro.write import Writer


def test_writer_class_sync_interval_automatic_flush(tmpdir):
    """
    Create an Avro file using the Writer class with sync_interval set to 0.
    Verify that data does not accumulate in memory but is automatically
    flushed to the file object as each record is added.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field1",
            "type": {"type": "string"}
        }, {
            "name": "field2",
            "type": {"type": "int"}
        }]
    }

    records = [{
        "field1": "test1",
        "field2": -1
    }, {
        "field1": "test2",
        "field2": 5
    }]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate', sync_interval=0)

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        file_size_history = [fo.tell()]
        for i, record in enumerate(records):
            assert w.block_count == 0
            w.write(record)

            # Verify records are flushed to the file immediately:
            # 1. Block count stays at zero
            # 2. File size increases with every record
            assert w.block_count == 0
            file_size_history.append(fo.tell())
            assert file_size_history[-1] > file_size_history[-2]

        # Every record was already flushed on write, so an explicit flush()
        # should not add any more data to the file.
        w.flush()
        assert fo.tell() == file_size_history[-1]

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
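# The sync_interval=0 behaviour exercised above is useful when each record
# must reach the underlying stream as soon as it is written, for example when
# another process tails the output file. A minimal sketch of that pattern,
# assuming a parsed `schema` dict like the one in the test; the path and the
# record source are invented for illustration:
def stream_records_immediately(records, schema, path='/tmp/stream.avro'):
    from fastavro.write import Writer

    with open(path, 'wb') as fo:
        # sync_interval=0 makes every write() emit a complete block instead of
        # buffering records in memory until the next sync point.
        writer = Writer(fo, schema, codec='null', sync_interval=0)
        for record in records:
            writer.write(record)
            fo.flush()  # push the block through OS-level buffering as well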
def test_writer_class_flush_end(tmpdir):
    """
    Create an Avro file using the Writer class. Verify that data accumulates
    in memory and is written when flush() is called.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field1",
            "type": {"type": "string"}
        }, {
            "name": "field2",
            "type": {"type": "int"}
        }]
    }

    records = [{
        "field1": "test1",
        "field2": -1
    }, {
        "field1": "test2",
        "field2": 5
    }]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        size_with_header_only = fo.tell()
        for i, record in enumerate(records):
            assert w.block_count == i
            w.write(record)

            # Verify records are being stored *in memory*:
            # 1. Block count increases
            # 2. File size does not increase
            assert w.block_count == i + 1
            assert fo.tell() == size_with_header_only

        # Flushing the file writes the data. File size should increase now.
        w.flush()
        assert fo.tell() > size_with_header_only

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
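# Both tests above exercise the same Writer life cycle: constructing the
# Writer emits the Avro header, write() buffers records until the next sync
# point, and flush() forces the remaining block out. A minimal standalone
# sketch of that flow, using an in-memory buffer; the function name and the
# use of io.BytesIO are illustrative, not part of the tests:
def roundtrip_in_memory(schema, records):
    import io

    import fastavro
    from fastavro.write import Writer

    buf = io.BytesIO()
    writer = Writer(buf, schema, codec='deflate')
    for record in records:
        writer.write(record)  # buffered until the next sync point
    writer.flush()            # force the last (partial) block into `buf`

    buf.seek(0)
    return list(fastavro.reader(buf))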
def _write_toavro(table, target, mode, schema, sample,
                  codec='deflate', compression_level=None, **avro_args):
    if table is None:
        return

    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)

    # fastavro expects an iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    target2 = write_source_from_arg(target, mode=mode)
    with target2.open(mode) as target_file:
        # delay the import of fastavro so it does not break when unused
        from fastavro import parse_schema
        from fastavro.write import Writer

        parsed_schema = parse_schema(schema)
        writer = Writer(fo=target_file,
                        schema=parsed_schema,
                        codec=codec,
                        compression_level=compression_level,
                        **avro_args)

        num = 1
        for record in rows:
            try:
                writer.write(record)
                num = num + 1
            except ValueError as verr:
                vmsg = _get_error_details(target, num, verr, record, schema)
                _raise_error(ValueError, vmsg)
            except TypeError as terr:
                tmsg = _get_error_details(target, num, terr, record, schema)
                _raise_error(TypeError, tmsg)
        # finish writing
        writer.flush()
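# The helper above is essentially a thin wrapper around fastavro's
# parse_schema() plus Writer: parse the schema once, write row dicts in a
# loop, and flush at the end, with any extra keyword arguments passed straight
# through to the Writer. A self-contained sketch of that core pattern, free of
# the petl-specific plumbing; the function name and the plain-file handling
# are assumptions for illustration:
def write_dicts_to_avro(path, schema, rows, codec='deflate',
                        compression_level=None, **avro_args):
    from fastavro import parse_schema
    from fastavro.write import Writer

    parsed_schema = parse_schema(schema)
    with open(path, 'wb') as fo:
        writer = Writer(fo=fo,
                        schema=parsed_schema,
                        codec=codec,
                        compression_level=compression_level,
                        **avro_args)
        # _write_toavro wraps this loop in try/except to attach the offending
        # record and schema to ValueError/TypeError; omitted here for brevity.
        for record in rows:
            writer.write(record)
        writer.flush()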
def write_avro_file(f, results_iter, fields, table, max_size=100 * 1024 ** 2):
    """Takes a database result set (list of dicts) and writes an avro file
    up to a particular size.

    If the schema 'name' is the same as an Avro data type (WRITERS.keys()),
    everything will break for no apparent reason. 'name' isn't even really
    used.

    Returns complete, row_count:
    complete is True if the entire results_iter has been drained -- False if
    there are more records to be processed.
    row_count is the number of items written.
    max_size is the limit at which we should start writing another file.
    """
    if table in WRITERS:
        table += "zzz"

    schema = {"type": "record", "name": table, "fields": fields}

    writer = Writer(f, schema)
    row_count = 0
    complete = False
    try:
        # writer.io buffers before writing
        while f.tell() + writer.io.tell() < max_size:
            writer.write(_format_row(next(results_iter)))
            row_count += 1
    except StopIteration:
        complete = True
    finally:
        writer.flush()
    return complete, row_count
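# Because write_avro_file() stops at max_size and reports whether the source
# iterator was drained, a natural way to use it is a loop that rolls over to a
# new file whenever the current one fills up. A sketch of such a driver; the
# function name, file-naming scheme, and `rows`/`prefix` arguments are
# invented for illustration, and it assumes the surrounding module's
# _format_row() and WRITERS are available:
def write_avro_files(rows, fields, table, prefix, max_size=100 * 1024 ** 2):
    results_iter = iter(rows)
    paths = []
    part = 0
    complete = False
    while not complete:
        path = "%s-%05d.avro" % (prefix, part)
        with open(path, 'wb') as f:
            complete, row_count = write_avro_file(
                f, results_iter, fields, table, max_size)
        paths.append(path)
        part += 1
    return paths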