def test_inference_failure(self):
    """Check reads that fail conversion after the schema has been inferred.

    Two scenarios are exercised: one where the failing block is followed
    by EOF, and one where a later block still converts successfully.
    """
    # Both scenarios expect the same inferred schema and the same error.
    int64_schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
    conversion_error = "CSV conversion error to int64"
    options = ReadOptions()

    # Scenario 1: inference on first block, then conversion failure on
    # second block.
    data = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
    options.block_size = len(data) - 7
    reader = self.open_bytes(data, read_options=options)
    assert reader.schema == int64_schema
    assert reader.read_next_batch().to_pydict() == {'a': [123], 'b': [456]}
    # Second block
    with pytest.raises(ValueError, match=conversion_error):
        reader.read_next_batch()
    # EOF
    with pytest.raises(StopIteration):
        reader.read_next_batch()

    # Scenario 2: inference on first block, then conversion failure on
    # second block, then success on third block.
    data = b"a,b\n1,2\nabc,def\n45,67\n"
    options.block_size = 8
    reader = self.open_bytes(data, read_options=options)
    assert reader.schema == int64_schema
    assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
    # Second block
    with pytest.raises(ValueError, match=conversion_error):
        reader.read_next_batch()
    # Third block
    assert reader.read_next_batch().to_pydict() == {'a': [45], 'b': [67]}
    # EOF
    with pytest.raises(StopIteration):
        reader.read_next_batch()
def test_batch_lifetime(self):
    """Check that allocated memory stays flat while batches are streamed."""
    gc.collect()
    baseline_bytes = pa.total_allocated_bytes()

    # Memory occupation should not grow with CSV file size
    def assert_next_batch(stream, expected):
        # NOTE(review): the batch is presumably kept local to this helper
        # so its reference is gone before allocation is sampled - confirm.
        batch = stream.read_next_batch()
        assert batch.to_pydict() == expected

    data = b"10,11\n12,13\n14,15\n16,17\n"
    options = ReadOptions()
    options.column_names = ['a', 'b']
    options.block_size = 6
    reader = self.open_bytes(data, read_options=options)

    assert_next_batch(reader, {'a': [10], 'b': [11]})
    steady_bytes = pa.total_allocated_bytes()

    # Every later batch must leave the allocation at the level observed
    # after the first one.
    remaining_batches = (
        {'a': [12], 'b': [13]},
        {'a': [14], 'b': [15]},
        {'a': [16], 'b': [17]},
    )
    for expected in remaining_batches:
        assert_next_batch(reader, expected)
        assert pa.total_allocated_bytes() == steady_bytes

    with pytest.raises(StopIteration):
        reader.read_next_batch()
    # Allocation is back at the starting level once the stream is
    # exhausted, and stays there after the reader reference is dropped.
    assert pa.total_allocated_bytes() == baseline_bytes
    reader = None
    assert pa.total_allocated_bytes() == baseline_bytes
def test_invalid_csv(self):
    """Check that rows with the wrong number of columns raise ArrowInvalid."""
    column_mismatch = "Expected 2 columns, got 3"
    options = ReadOptions()

    # CSV errors on first block: opening the reader itself must fail.
    data = b"a,b\n1,2,3\n4,5\n6,7\n"
    options.block_size = 10
    with pytest.raises(pa.ArrowInvalid, match=column_mismatch):
        self.open_bytes(data, read_options=options)

    # CSV errors on second block: the first batch is still readable.
    data = b"a,b\n1,2\n3,4,5\n6,7\n"
    options.block_size = 8
    reader = self.open_bytes(data, read_options=options)
    first_batch = reader.read_next_batch().to_pydict()
    assert first_batch == {'a': [1], 'b': [2]}
    with pytest.raises(pa.ArrowInvalid, match=column_mismatch):
        reader.read_next_batch()
    # Cannot continue after a parse error
    with pytest.raises(StopIteration):
        reader.read_next_batch()
def test_inference(self):
    """Check the inferred schema and batches for two block sizes."""
    # Inference is done on first block
    data = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
    schema = pa.schema([('a', pa.string()), ('b', pa.binary())])
    options = ReadOptions()

    # Block size equal to the input length: one batch is expected.
    single_batch = [
        {'a': ['123', 'abc', 'gh'], 'b': [b'456', b'de\xff', b'ij']},
    ]
    options.block_size = len(data)
    reader = self.open_bytes(data, read_options=options)
    self.check_reader(reader, schema, single_batch)

    # Block size one byte shorter: the last row is expected in a second
    # batch.
    two_batches = [
        {'a': ['123', 'abc'], 'b': [b'456', b'de\xff']},
        {'a': ['gh'], 'b': [b'ij']},
    ]
    options.block_size = len(data) - 1
    reader = self.open_bytes(data, read_options=options)
    self.check_reader(reader, schema, two_batches)