def test_include_missing_columns(self):
    """Check that requested-but-absent columns come back null-filled.

    With ``include_missing_columns`` enabled, names listed in
    ``include_columns`` that do not exist in the CSV data are expected
    to show up as all-null columns.  Three scenarios are covered:

    1. column names taken from the first CSV row;
    2. explicit ``column_names``, in which case the first CSV row is
       expected back as data (four rows instead of three);
    3. explicit ``column_types``, which are expected to apply to the
       missing column as well.
    """
    csv_data = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

    read_opts = ReadOptions()
    convert_opts = ConvertOptions()

    def read_table():
        # Both option objects are mutated between scenarios, so they
        # are looked up at call time rather than captured by value.
        return self.read_bytes(csv_data, read_options=read_opts,
                               convert_options=convert_opts)

    # 'xx' and 'yy' are absent from the data; only 'ab' is real.
    convert_opts.include_columns = ['xx', 'ab', 'yy']
    convert_opts.include_missing_columns = True
    result = read_table()
    expected_schema = pa.schema([
        ('xx', pa.null()),
        ('ab', pa.string()),
        ('yy', pa.null()),
    ])
    assert result.schema == expected_schema
    assert result.to_pydict() == {
        "xx": [None, None, None],
        "ab": ["ef", "ij", "mn"],
        "yy": [None, None, None],
    }

    # Combining with `column_names`: 'yy' now names the second CSV
    # column, while 'cd' is not a column name anymore and is missing.
    read_opts.column_names = ["xx", "yy"]
    convert_opts.include_columns = ["yy", "cd"]
    result = read_table()
    expected_schema = pa.schema([
        ('yy', pa.string()),
        ('cd', pa.null()),
    ])
    assert result.schema == expected_schema
    assert result.to_pydict() == {
        "yy": ["cd", "gh", "kl", "op"],
        "cd": [None, None, None, None],
    }

    # And with `column_types` as well: the missing column is expected
    # to take the requested type instead of null.
    convert_opts.column_types = {"yy": pa.binary(), "cd": pa.int32()}
    result = read_table()
    expected_schema = pa.schema([
        ('yy', pa.binary()),
        ('cd', pa.int32()),
    ])
    assert result.schema == expected_schema
    assert result.to_pydict() == {
        "yy": [b"cd", b"gh", b"kl", b"op"],
        "cd": [None, None, None, None],
    }
def test_column_options(self):
    """Check column naming, selection and typing options on a reader.

    Opens the same header-less CSV bytes through ``self.open_bytes``
    under progressively richer options and verifies each reader with
    ``self.check_reader``:

    * ``column_names`` supplies the names;
    * ``include_columns`` selects and reorders columns;
    * ``column_types`` overrides an inferred type;
    * a name in ``include_columns`` that does not exist raises
      ``KeyError`` unless ``include_missing_columns`` is set, in which
      case the column is expected to be all-null (typed as null, or
      as given in ``column_types``).
    """
    csv_data = b"1,2,3\n4,5,6"

    # With column_names
    read_opts = ReadOptions()
    read_opts.column_names = ['d', 'e', 'f']
    reader = self.open_bytes(csv_data, read_options=read_opts)
    schema = pa.schema([
        ('d', pa.int64()),
        ('e', pa.int64()),
        ('f', pa.int64()),
    ])
    self.check_reader(reader, schema, [{
        'd': [1, 4],
        'e': [2, 5],
        'f': [3, 6],
    }])

    convert_opts = ConvertOptions()

    def open_reader():
        # `convert_opts` is mutated between scenarios, so it is looked
        # up at call time rather than captured by value.
        return self.open_bytes(csv_data, read_options=read_opts,
                               convert_options=convert_opts)

    # With include_columns
    convert_opts.include_columns = ['f', 'e']
    reader = open_reader()
    schema = pa.schema([
        ('f', pa.int64()),
        ('e', pa.int64()),
    ])
    self.check_reader(reader, schema, [{
        'e': [2, 5],
        'f': [3, 6],
    }])

    # With column_types
    convert_opts.column_types = {'e': pa.string()}
    reader = open_reader()
    schema = pa.schema([
        ('f', pa.int64()),
        ('e', pa.string()),
    ])
    self.check_reader(reader, schema, [{
        'e': ["2", "5"],
        'f': [3, 6],
    }])

    # Missing columns in include_columns
    convert_opts.include_columns = ['g', 'f', 'e']
    with pytest.raises(
            KeyError,
            match="Column 'g' in include_columns does not exist"):
        open_reader()

    # Same selection, but missing columns are now tolerated.
    convert_opts.include_missing_columns = True
    reader = open_reader()
    schema = pa.schema([
        ('g', pa.null()),
        ('f', pa.int64()),
        ('e', pa.string()),
    ])
    self.check_reader(reader, schema, [{
        'g': [None, None],
        'e': ["2", "5"],
        'f': [3, 6],
    }])

    # An explicit type for the missing column replaces the null type.
    convert_opts.column_types = {'e': pa.string(), 'g': pa.float64()}
    reader = open_reader()
    schema = pa.schema([
        ('g', pa.float64()),
        ('f', pa.int64()),
        ('e', pa.string()),
    ])
    self.check_reader(reader, schema, [{
        'g': [None, None],
        'e': ["2", "5"],
        'f': [3, 6],
    }])