import fastavro
from fastavro.write import Writer


def test_writer_class_split_files(tmpdir):
    """
    Create 2 Avro files using the Writer class and the default sync_interval
    setting. We write to one file until the Writer automatically flushes, then
    write more records to the other file. Verify that the two files together
    contain all the records that were written.

    This simulates a real-world use case where a large Avro data set is split
    into files of approximately the same size.
    """
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [{
            "name": "field",
            "type": {"type": "string"}
        }]
    }

    records = []

    def _append_record(writer_):
        record = {"field": "test{}".format(len(records))}
        records.append(record)
        writer_.write(record)

    temp_paths = [
        tmpdir.join('test_writer_class1.avro'),
        tmpdir.join('test_writer_class2.avro')
    ]
    interim_record_counts = []

    # First file: write records until block_count goes back to 0 for the
    # second time, i.e. until the Writer has auto-flushed twice.
    with temp_paths[0].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        _append_record(w)
        while w.block_count > 0:
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    # Second file: 100 records.
    with temp_paths[1].open('wb') as fo:
        w = Writer(fo, schema, codec='deflate')
        for _ in range(100):
            _append_record(w)
        w.flush()
    interim_record_counts.append(len(records))

    assert interim_record_counts[1] == interim_record_counts[0] + 100

    # Read the records back to verify they were written correctly.
    new_records = []
    new_interim_record_counts = []
    for temp_path in temp_paths:
        new_reader = fastavro.reader(temp_path.open('rb'))
        new_records += list(new_reader)
        new_interim_record_counts.append(len(new_records))
    assert new_records == records
    assert interim_record_counts == new_interim_record_counts
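# A minimal, self-contained sketch of the Writer usage the test above
# exercises; the path "example.avro" and the record values are illustrative
# assumptions, not taken from the test. Writer buffers records and emits a
# block to the file when the buffer crosses sync_interval, or when flush()
# is called explicitly.
import fastavro
from fastavro.write import Writer

schema = {
    "type": "record",
    "name": "Test",
    "namespace": "test",
    "fields": [{"name": "field", "type": "string"}],
}

with open("example.avro", 'wb') as out:
    writer = Writer(out, schema, codec='deflate')
    for i in range(10):
        writer.write({"field": "test{}".format(i)})
    writer.flush()  # write any records still buffered in the current block

with open("example.avro", 'rb') as fo:
    assert [r["field"] for r in fastavro.reader(fo)] == [
        "test{}".format(i) for i in range(10)
    ]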
def open(self, temp_path):
    self.file_handle = super().open(temp_path)
    return Writer(self.file_handle, self._schema, self._codec)
def open(self, temp_path):
    file_handle = super(_AvroSink, self).open(temp_path)
    return Writer(file_handle, self._schema.to_json(), self._codec)
def open(self, temp_path):
    file_handle = super(_FastAvroSink, self).open(temp_path)
    return Writer(file_handle, self._schema, self._codec)
def open(self, temp_path):
    # TODO(BEAM-4749): fastavro fails to install in MacOS.
    from fastavro.write import Writer  # pylint: disable=wrong-import-position
    file_handle = super(_AvroSink, self).open(temp_path)
    return Writer(file_handle, self._schema.to_json(), self._codec)
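# A hedged sketch (not taken verbatim from any snippet above) of the
# companion sink methods that consume the Writer returned by open(), assuming
# Beam's FileBasedSink contract of open() -> write_record() -> close(). The
# bodies are an assumption based on the fastavro Writer API; close() pairs
# with the variant above that keeps the handle on self.file_handle, and
# Writer.flush() writes the last buffered block before the file is closed.
def write_record(self, writer, value):
    # `writer` is the fastavro Writer instance that open() returned.
    writer.write(value)

def close(self, writer):
    # Flush records still buffered in the current block, then close the
    # underlying file handle kept by open().
    writer.flush()
    self.file_handle.close()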