def test_writer_class_sync_interval_automatic_flush(tmpdir):
    """
    Create an Avro file using the Writer class with sync_interval set to 0.
    Verify that data does not accumulate in memory but is automatically
    flushed to the file object as each record is added.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field1",
            "type": {"type": "string"}
        }, {
            "name": "field2",
            "type": {"type": "int"}
        }]
    }
    records = [{
        "field1": "test1",
        "field2": -1
    }, {
        "field1": "test2",
        "field2": 5
    }]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate', sync_interval=0)

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        file_size_history = [fo.tell()]
        for i, record in enumerate(records):
            assert w.block_count == 0
            w.write(record)

            # Verify each record is flushed to the file immediately:
            # 1. Block count stays at 0
            # 2. File size increases
            assert w.block_count == 0
            file_size_history.append(fo.tell())
            assert file_size_history[-1] > file_size_history[-2]

        # Flushing writes no additional data because every record was already
        # flushed on write. File size should not change.
        w.flush()
        assert fo.tell() == file_size_history[-1]

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
def test_writer_class_flush_end(tmpdir):
    """
    Create an Avro file using the Writer class. Verify that data accumulates
    in memory and is written when flush() is called.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field1",
            "type": {"type": "string"}
        }, {
            "name": "field2",
            "type": {"type": "int"}
        }]
    }
    records = [{
        "field1": "test1",
        "field2": -1
    }, {
        "field1": "test2",
        "field2": 5
    }]

    temp_path = tmpdir.join('test_writer_class.avro')
    with temp_path.open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')

        # Creating the Writer adds the Avro file header. Get file size with
        # header only.
        size_with_header_only = fo.tell()
        for i, record in enumerate(records):
            assert w.block_count == i
            w.write(record)

            # Verify records are being stored *in memory*:
            # 1. Block count increases
            # 2. File size does not increase
            assert w.block_count == i + 1
            assert fo.tell() == size_with_header_only

        # Flushing the file writes the data. File size should increase now.
        w.flush()
        assert fo.tell() > size_with_header_only

    # Read the records to verify they were written correctly.
    new_reader = fastavro.reader(temp_path.open('rb'))
    new_records = list(new_reader)
    assert new_records == records
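
# A minimal sketch of the buffered-write pattern exercised by the two tests
# above. The function name, output path, and record iterable are illustrative
# assumptions, not part of the tests: records accumulate in the Writer's
# in-memory block and the final, partially filled block is committed by
# flush().
def stream_records_to_avro(path, schema, record_iter):
    from fastavro.write import Writer

    with open(path, 'wb') as out:
        writer = Writer(out, schema, codec='deflate')
        for record in record_iter:
            writer.write(record)  # buffered until sync_interval is reached
        writer.flush()            # commit whatever is still buffered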
def _write_toavro(table, target, mode, schema, sample,
                  codec='deflate', compression_level=None, **avro_args):
    if table is None:
        return
    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)

    # fastavro expects an iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    target2 = write_source_from_arg(target, mode=mode)
    with target2.open(mode) as target_file:
        # delay the import of fastavro so it does not break when unused
        from fastavro import parse_schema
        from fastavro.write import Writer

        parsed_schema = parse_schema(schema)
        writer = Writer(fo=target_file,
                        schema=parsed_schema,
                        codec=codec,
                        compression_level=compression_level,
                        **avro_args)
        num = 1
        for record in rows:
            try:
                writer.write(record)
                num = num + 1
            except ValueError as verr:
                vmsg = _get_error_details(target, num, verr, record, schema)
                _raise_error(ValueError, vmsg)
            except TypeError as terr:
                tmsg = _get_error_details(target, num, terr, record, schema)
                _raise_error(TypeError, tmsg)
        # finish writing
        writer.flush()
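
# A minimal usage sketch, assuming petl's public toavro() wrapper (which
# delegates to _write_toavro above); the table contents and file name are
# illustrative only. When schema is omitted, one is inferred from a sample
# of the rows via _build_schema_from_values.
import petl as etl

example_table = [['field1', 'field2'],
                 ['test1', -1],
                 ['test2', 5]]
etl.toavro(example_table, 'example.avro', codec='deflate')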
def write_avro_file(f, results_iter, fields, table, max_size=100 * 1024 ** 2):
    """Takes a database result set (list of dicts) and writes an avro file
    up to a particular size.

    If the schema 'name' is the same as an Avro data type (WRITERS.keys())
    everything will break for no apparent reason. 'name' isn't even really
    used.

    Returns (complete, row_count):

    complete is true if the entire results_iter has been drained -- false if
    there are more records to be processed.

    row_count is the number of items written.

    max_size is the limit at which we should start writing another file.
    """
    if table in WRITERS:
        table += "zzz"

    schema = {"type": "record", "name": table, "fields": fields}
    writer = Writer(f, schema)

    row_count = 0
    complete = False
    try:
        # writer.io buffers records before they are written to f
        while f.tell() + writer.io.tell() < max_size:
            writer.write(_format_row(next(results_iter)))
            row_count += 1
    except StopIteration:
        complete = True
    finally:
        writer.flush()

    return complete, row_count
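
# A usage sketch of the contract described in the docstring above. The helper
# name and path pattern are hypothetical: keep calling write_avro_file with
# fresh file objects until it reports that results_iter is drained, producing
# a series of files each capped at roughly max_size bytes.
def dump_result_set(results_iter, fields, table, path_template='part-{}.avro'):
    parts, total_rows, complete = 0, 0, False
    while not complete:
        with open(path_template.format(parts), 'wb') as f:
            complete, row_count = write_avro_file(f, results_iter, fields, table)
        total_rows += row_count
        parts += 1
    return parts, total_rows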
def test_writer_class_split_files(tmpdir):
    """
    Create 2 Avro files using the Writer class and the default sync_interval
    setting. We write to one file until the Writer automatically flushes, then
    write more records to the other file. Verify that the two files together
    contain all the records that were written.

    This simulates a real-world use case where a large Avro data set is split
    into files of approximately the same size.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field",
            "type": {"type": "string"}
        }]
    }

    records = []

    def _append_record(writer_):
        record = {"field": "test{}".format(len(records))}
        records.append(record)
        writer_.write(record)

    temp_paths = [
        tmpdir.join('test_writer_class1.avro'),
        tmpdir.join('test_writer_class2.avro')
    ]
    interim_record_counts = []

    # First file: Write records until block_count goes back to 0 for the
    # second time.
    with temp_paths[0].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    # Second file: 100 records
    with temp_paths[1].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        for i in range(100):
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    assert interim_record_counts[1] == interim_record_counts[0] + 100

    # Read the records to verify they were written correctly.
    new_records = []
    new_interim_record_counts = []
    for temp_path in temp_paths:
        new_reader = fastavro.reader(temp_path.open('rb'))
        new_records += list(new_reader)
        new_interim_record_counts.append(len(new_records))
    assert new_records == records
    assert interim_record_counts == new_interim_record_counts
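
# A sketch of the rollover pattern the test above simulates; the function
# name and path template are hypothetical. Unlike the byte-size check used
# with write_avro_file, this relies on the Writer's own signal: block_count
# returning to 0 after a write means the in-memory block was just flushed,
# so we switch to a new file at that point.
def write_rolled_files(records, schema, path_template='split-{}.avro'):
    from fastavro.write import Writer

    it = iter(records)
    record = next(it, None)
    part = 0
    paths = []
    while record is not None:
        path = path_template.format(part)
        paths.append(path)
        with open(path, 'wb') as out:
            writer = Writer(out, schema, codec='deflate')
            while record is not None:
                writer.write(record)
                rolled = (writer.block_count == 0)  # block was auto-flushed
                record = next(it, None)
                if rolled:
                    break
            writer.flush()  # commit any partially filled final block
        part += 1
    return paths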