def test_concatenation_table_flatten(blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] table = ConcatenationTable.from_blocks(blocks).flatten() assert table.table == in_memory_pa_table.flatten() assert isinstance(table, ConcatenationTable)
def test_concatenation_table_from_blocks(in_memory_pa_table, in_memory_blocks): assert len(in_memory_pa_table) > 2 in_memory_table = InMemoryTable(in_memory_pa_table) t1, t2 = in_memory_table.slice(0, 2), in_memory_table.slice(2) table = ConcatenationTable.from_blocks(in_memory_table) assert isinstance(table, ConcatenationTable) assert table.table == in_memory_pa_table assert table.blocks == [[in_memory_table]] table = ConcatenationTable.from_blocks([t1, t2]) assert isinstance(table, ConcatenationTable) assert table.blocks == [[t1], [t2]] assert table.table == in_memory_pa_table table = ConcatenationTable.from_blocks([[t1], [t2]]) assert isinstance(table, ConcatenationTable) assert table.table == in_memory_pa_table assert table.blocks == [[t1], [t2]] table = ConcatenationTable.from_blocks(in_memory_blocks) assert isinstance(table, ConcatenationTable) assert table.table == in_memory_pa_table assert table.blocks == in_memory_blocks
def test_concatenation_table_drop(blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] names = [in_memory_pa_table.column_names[0]] table = ConcatenationTable.from_blocks(blocks).drop(names) assert table.table == in_memory_pa_table.drop(names) assert isinstance(table, ConcatenationTable)
def test_concatenation_table_filter(blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] mask = pa.array([i % 2 == 0 for i in range(len(in_memory_pa_table))]) table = ConcatenationTable.from_blocks(blocks).filter(mask) assert table.table == in_memory_pa_table.filter(mask) assert isinstance(table, ConcatenationTable)
def test_concatenation_table_replace_schema_metadata( blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks ): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] metadata = {"huggingface": "{}"} table = ConcatenationTable.from_blocks(blocks).replace_schema_metadata(metadata) assert table.table.schema.metadata == in_memory_pa_table.replace_schema_metadata(metadata).schema.metadata assert isinstance(table, ConcatenationTable)
def test_concatenation_table_rename_columns( blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks ): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] assert "tokens" in in_memory_pa_table.column_names names = [name if name != "tokens" else "new_tokens" for name in in_memory_pa_table.column_names] table = ConcatenationTable.from_blocks(blocks).rename_columns(names) assert isinstance(table, ConcatenationTable) assert table.table == in_memory_pa_table.rename_columns(names)
def test_concatenation_table_init( blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks ): blocks = ( in_memory_blocks if blocks_type == "in_memory" else memory_mapped_blocks if blocks_type == "memory_mapped" else mixed_in_memory_and_memory_mapped_blocks ) table = ConcatenationTable(in_memory_pa_table, blocks) assert table.table == in_memory_pa_table assert table.blocks == blocks
def test_concat_tables(arrow_file, in_memory_pa_table): t0 = in_memory_pa_table t1 = InMemoryTable(t0) t2 = MemoryMappedTable.from_file(arrow_file) t3 = ConcatenationTable.from_blocks(t1) concatenated_table = concat_tables([t0, t1, t2, t3]) assert concatenated_table.table == pa.concat_tables([t0] * 4) assert isinstance(concatenated_table, ConcatenationTable) assert len(concatenated_table.blocks) == 4 assert isinstance(concatenated_table.blocks[0][0], InMemoryTable) assert isinstance(concatenated_table.blocks[1][0], InMemoryTable) assert isinstance(concatenated_table.blocks[2][0], MemoryMappedTable) assert isinstance(concatenated_table.blocks[3][0], InMemoryTable)
def test_concatenation_table_from_blocks_doesnt_increase_memory( blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] with assert_arrow_memory_doesnt_increase(): table = ConcatenationTable.from_blocks(blocks) assert isinstance(table, ConcatenationTable) assert table.table == in_memory_pa_table assert table.blocks == blocks
def test_concatenation_table_pickle(blocks_type, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] table = ConcatenationTable.from_blocks(blocks) pickled_table = pickle.dumps(table) unpickled_table = pickle.loads(pickled_table) assert unpickled_table.table == table.table assert unpickled_table.blocks == table.blocks assert_index_attributes_equal(table, unpickled_table)
def test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file): in_memory_table = InMemoryTable(in_memory_pa_table) concatenation_table = ConcatenationTable.from_blocks(in_memory_table) memory_mapped_table = MemoryMappedTable.from_file(arrow_file) tables = [in_memory_pa_table, in_memory_table, concatenation_table, memory_mapped_table] if axis == 0: expected_table = pa.concat_tables([in_memory_pa_table] * len(tables)) else: expected_table = in_memory_pa_table for _ in range(1, len(tables)): for name, col in zip(in_memory_pa_table.column_names, in_memory_pa_table.columns): expected_table = expected_table.append_column(name, col) with assert_arrow_memory_doesnt_increase(): table = ConcatenationTable.from_tables(tables, axis=axis) assert isinstance(table, ConcatenationTable) assert table.table == expected_table # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable assert len(table.blocks) == 1 if axis == 1 else 2 assert len(table.blocks[0]) == 1 if axis == 0 else 2 assert axis == 1 or len(table.blocks[1]) == 1 assert isinstance(table.blocks[0][0], InMemoryTable) assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1], MemoryMappedTable)
def test_concatenation_table_deepcopy( blocks_type, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks ): blocks = { "in_memory": in_memory_blocks, "memory_mapped": memory_mapped_blocks, "mixed": mixed_in_memory_and_memory_mapped_blocks, }[blocks_type] table = ConcatenationTable.from_blocks(blocks) copied_table = copy.deepcopy(table) assert table.table == copied_table.table assert table.blocks == copied_table.blocks assert_index_attributes_equal(table, copied_table) # deepcopy must return the exact same arrow objects since they are immutable assert table.table is copied_table.table assert all(batch1 is batch2 for batch1, batch2 in zip(table._batches, copied_table._batches))
def test_concat_tables(arrow_file, in_memory_pa_table): t0 = in_memory_pa_table t1 = InMemoryTable(t0) t2 = MemoryMappedTable.from_file(arrow_file) t3 = ConcatenationTable.from_blocks(t1) tables = [t0, t1, t2, t3] concatenated_table = concat_tables(tables, axis=0) assert concatenated_table.table == pa.concat_tables([t0] * 4) assert concatenated_table.table.shape == (40, 4) assert isinstance(concatenated_table, ConcatenationTable) assert len(concatenated_table.blocks) == 3 # t0 and t1 are consolidated as a single InMemoryTable assert isinstance(concatenated_table.blocks[0][0], InMemoryTable) assert isinstance(concatenated_table.blocks[1][0], MemoryMappedTable) assert isinstance(concatenated_table.blocks[2][0], InMemoryTable) concatenated_table = concat_tables(tables, axis=1) assert concatenated_table.table.shape == (10, 16) assert len(concatenated_table.blocks[0]) == 3 # t0 and t1 are consolidated as a single InMemoryTable assert isinstance(concatenated_table.blocks[0][0], InMemoryTable) assert isinstance(concatenated_table.blocks[0][1], MemoryMappedTable) assert isinstance(concatenated_table.blocks[0][2], InMemoryTable)
def test_concat_tables(arrow_file, in_memory_pa_table): t0 = in_memory_pa_table t1 = InMemoryTable(t0) t2 = MemoryMappedTable.from_file(arrow_file) t3 = ConcatenationTable.from_blocks(t1) tables = [t0, t1, t2, t3] concatenated_table = concat_tables(tables, axis=0) assert concatenated_table.table == pa.concat_tables([t0] * 4) assert concatenated_table.table.shape == (40, 4) assert isinstance(concatenated_table, ConcatenationTable) assert len(concatenated_table.blocks) == 3 # t0 and t1 are consolidated as a single InMemoryTable assert isinstance(concatenated_table.blocks[0][0], InMemoryTable) assert isinstance(concatenated_table.blocks[1][0], MemoryMappedTable) assert isinstance(concatenated_table.blocks[2][0], InMemoryTable) # add suffix to avoid error due to duplicate column names concatenated_table = concat_tables( [add_suffix_to_column_names(table, i) for i, table in enumerate(tables)], axis=1 ) assert concatenated_table.table.shape == (10, 16) assert len(concatenated_table.blocks[0]) == 3 # t0 and t1 are consolidated as a single InMemoryTable assert isinstance(concatenated_table.blocks[0][0], InMemoryTable) assert isinstance(concatenated_table.blocks[0][1], MemoryMappedTable) assert isinstance(concatenated_table.blocks[0][2], InMemoryTable)