def test_header_skip_rows(self):
    """Check ``skip_rows`` when the header is taken from the data.

    The first row left after skipping is expected to supply the column
    names; skipping every row must raise ``pa.ArrowInvalid``.
    """
    payload = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
    read_opts = ReadOptions()

    def parse(data):
        # All reads share one options object that is mutated step by step.
        return self.read_bytes(data, read_options=read_opts)

    def verify(table, expected):
        # The expected mapping's key order doubles as the expected names.
        self.check_names(table, list(expected))
        assert table.to_pydict() == expected

    cases = [
        # (rows to skip, expected table contents)
        (1, {"ef": ["ij", "mn"], "gh": ["kl", "op"]}),
        # Only the last row is left: it becomes the header, with no data.
        (3, {"mn": [], "op": []}),
    ]
    for skip, expected in cases:
        read_opts.skip_rows = skip
        verify(parse(payload), expected)

    # Not enough rows
    read_opts.skip_rows = 4
    with pytest.raises(pa.ArrowInvalid):
        parse(payload)

    # Can skip rows with a different number of columns
    ragged = b"abcd\n,,,,,\nij,kl\nmn,op\n"
    read_opts.skip_rows = 2
    verify(parse(ragged), {"ij": ["mn"], "kl": ["op"]})
def test_header_column_names(self):
    """Check explicit ``column_names`` combined with ``skip_rows``.

    With names supplied through the options, the first input row is
    expected to come back as data rather than as a header. The test also
    covers skipping past the end of the input and a mismatch between the
    number of names and the number of columns.
    """
    payload = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
    read_opts = ReadOptions()
    read_opts.column_names = ["x", "y"]

    def parse(data):
        # All reads share one options object that is mutated step by step.
        return self.read_bytes(data, read_options=read_opts)

    def verify(table, expected):
        # The expected mapping's key order doubles as the expected names.
        self.check_names(table, list(expected))
        assert table.to_pydict() == expected

    # No rows skipped yet: every input row is data.
    verify(
        parse(payload),
        {
            "x": ["ab", "ef", "ij", "mn"],
            "y": ["cd", "gh", "kl", "op"],
        },
    )

    read_opts.skip_rows = 3
    verify(parse(payload), {"x": ["mn"], "y": ["op"]})

    # Skipping all four rows still yields the named, but empty, columns.
    read_opts.skip_rows = 4
    verify(parse(payload), {"x": [], "y": []})

    # Not enough rows
    read_opts.skip_rows = 5
    with pytest.raises(pa.ArrowInvalid):
        parse(payload)

    # Unexpected number of columns
    read_opts.skip_rows = 0
    read_opts.column_names = ["x", "y", "z"]
    with pytest.raises(pa.ArrowInvalid, match="Expected 3 columns, got 2"):
        parse(payload)

    # Can skip rows with a different number of columns
    ragged = b"abcd\n,,,,,\nij,kl\nmn,op\n"
    read_opts.skip_rows = 2
    read_opts.column_names = ["x", "y"]
    verify(parse(ragged), {"x": ["ij", "mn"], "y": ["kl", "op"]})
def test_header_autogenerate_column_names(self):
    """Check ``autogenerate_column_names`` combined with ``skip_rows``.

    Columns are expected to be named ``f0``, ``f1`` and every remaining
    row to be read as data; skipping every row must raise
    ``pa.ArrowInvalid``.
    """
    payload = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
    read_opts = ReadOptions()
    read_opts.autogenerate_column_names = True

    def parse():
        # All reads share one options object that is mutated step by step.
        return self.read_bytes(payload, read_options=read_opts)

    def verify(table, expected):
        # The expected mapping's key order doubles as the expected names.
        self.check_names(table, list(expected))
        assert table.to_pydict() == expected

    # No rows skipped yet: every input row is data.
    verify(
        parse(),
        {
            "f0": ["ab", "ef", "ij", "mn"],
            "f1": ["cd", "gh", "kl", "op"],
        },
    )

    read_opts.skip_rows = 3
    verify(parse(), {"f0": ["mn"], "f1": ["op"]})

    # Not enough rows, impossible to infer number of columns
    read_opts.skip_rows = 4
    with pytest.raises(pa.ArrowInvalid):
        parse()