Ejemplo n.º 1
0
def test_concatenation_table_cast(blocks_type, in_memory_pa_table,
                                  in_memory_blocks, memory_mapped_blocks,
                                  mixed_in_memory_and_memory_mapped_blocks):
    """Check that ``ConcatenationTable.cast`` agrees with ``pa.Table.cast``.

    Two target schemas are tried in turn: first every ``list<int64>`` column
    becomes ``list<int32>``, then every ``int64`` column becomes ``int32``.
    Each time the result must hold the same data as casting the plain pyarrow
    table and must still be a ``ConcatenationTable``.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    column_names = in_memory_pa_table.schema.names
    column_types = in_memory_pa_table.schema.types
    # The fixture is expected to contain both column types targeted below.
    assert pa.list_(pa.int64()) in column_types
    assert pa.int64() in column_types
    casts = (
        (pa.list_(pa.int64()), pa.list_(pa.int32())),
        (pa.int64(), pa.int32()),
    )
    for old_type, new_type in casts:
        # Same columns as the source table, with `old_type` swapped out.
        target_schema = pa.schema({
            name: new_type if dtype == old_type else dtype
            for name, dtype in zip(column_names, column_types)
        })
        table = ConcatenationTable.from_blocks(blocks).cast(target_schema)
        assert table.table == in_memory_pa_table.cast(target_schema)
        assert isinstance(table, ConcatenationTable)
Ejemplo n.º 2
0
def test_concatenation_table_cast(blocks_type, in_memory_pa_table,
                                  in_memory_blocks, memory_mapped_blocks,
                                  mixed_in_memory_and_memory_mapped_blocks):
    """Check ``ConcatenationTable.cast`` against ``pa.Table.cast``.

    The ``list<int64>`` -> ``list<int32>`` cast is expected to raise
    ``pa.ArrowNotImplementedError`` when the pyarrow major version is below 4;
    otherwise, and for the ``int64`` -> ``int32`` cast in all cases, the result
    must match casting the plain pyarrow table and stay a
    ``ConcatenationTable``.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    # The fixture is expected to contain both column types targeted below.
    assert pa.list_(pa.int64()) in in_memory_pa_table.schema.types
    assert pa.int64() in in_memory_pa_table.schema.types

    def retyped_schema(old_type, new_type):
        # Schema of the source table with every `old_type` column retyped.
        source = in_memory_pa_table.schema
        return pa.schema({
            name: new_type if dtype == old_type else dtype
            for name, dtype in zip(source.names, source.types)
        })

    list_schema = retyped_schema(pa.list_(pa.int64()), pa.list_(pa.int32()))
    if config.PYARROW_VERSION.major >= 4:
        table = ConcatenationTable.from_blocks(blocks).cast(list_schema)
        assert table.table == in_memory_pa_table.cast(list_schema)
        assert isinstance(table, ConcatenationTable)
    else:
        with pytest.raises(pa.ArrowNotImplementedError):
            ConcatenationTable.from_blocks(blocks).cast(list_schema)

    int_schema = retyped_schema(pa.int64(), pa.int32())
    table = ConcatenationTable.from_blocks(blocks).cast(int_schema)
    assert table.table == in_memory_pa_table.cast(int_schema)
    assert isinstance(table, ConcatenationTable)
Ejemplo n.º 3
0
def test_concatenation_table_append_column(
        blocks_type, in_memory_pa_table, in_memory_blocks,
        memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks):
    """``ConcatenationTable.append_column`` must raise ``NotImplementedError``."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    # One integer per row of the reference table: 0, 1, 2, ...
    row_count = len(in_memory_pa_table)
    new_column = pa.array(list(range(row_count)))
    with pytest.raises(NotImplementedError):
        ConcatenationTable.from_blocks(blocks).append_column(
            "new_field", new_column)
Ejemplo n.º 4
0
def test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file):
    """Build a ``ConcatenationTable`` from mixed table kinds and check it.

    The inputs are a raw pyarrow table, an ``InMemoryTable``, a
    ``ConcatenationTable`` and a ``MemoryMappedTable``. The result is compared
    with the plain pyarrow concatenation along ``axis``, and the layout of its
    blocks is checked.
    """
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    memory_mapped_table = MemoryMappedTable.from_file(arrow_file)
    tables = [
        in_memory_pa_table, in_memory_table, concatenation_table,
        memory_mapped_table
    ]
    if axis == 0:
        expected_table = pa.concat_tables([in_memory_pa_table] * len(tables))
    else:
        # Append the reference columns once per additional table.
        expected_table = in_memory_pa_table
        for _ in range(1, len(tables)):
            for name, col in zip(in_memory_pa_table.column_names,
                                 in_memory_pa_table.columns):
                expected_table = expected_table.append_column(name, col)

    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(tables, axis=axis)
    assert isinstance(table, ConcatenationTable)
    assert table.table == expected_table
    # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable
    # The conditionals are parenthesized on purpose: `assert a == 1 if c else 2`
    # parses as `assert (a == 1) if c else 2`, whose else branch asserts the
    # truthy constant 2 and therefore checks nothing.
    assert len(table.blocks) == (1 if axis == 1 else 2)
    assert len(table.blocks[0]) == (1 if axis == 0 else 2)
    assert axis == 1 or len(table.blocks[1]) == 1
    assert isinstance(table.blocks[0][0], InMemoryTable)
    assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1],
                      MemoryMappedTable)
Ejemplo n.º 5
0
def test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file):
    """Build a ``ConcatenationTable`` from mixed table kinds and check it.

    The inputs are a raw pyarrow table, an ``InMemoryTable``, a
    ``ConcatenationTable`` and a ``MemoryMappedTable``. For the non-zero axis
    every table after the first is passed through
    ``add_suffix_to_column_names`` before concatenation. The result is compared
    with the equivalent plain pyarrow table, and the layout of its blocks is
    checked.
    """
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    memory_mapped_table = MemoryMappedTable.from_file(arrow_file)
    tables = [
        in_memory_pa_table, in_memory_table, concatenation_table,
        memory_mapped_table
    ]
    if axis == 0:
        expected_table = pa.concat_tables([in_memory_pa_table] * len(tables))
    else:
        # avoids error due to duplicate column names
        tables[1:] = [
            add_suffix_to_column_names(table, i)
            for i, table in enumerate(tables[1:], 1)
        ]
        # Append the (suffixed) columns of every additional table in order.
        expected_table = in_memory_pa_table
        for table in tables[1:]:
            for name, col in zip(table.column_names, table.columns):
                expected_table = expected_table.append_column(name, col)

    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(tables, axis=axis)
    assert isinstance(table, ConcatenationTable)
    assert table.table == expected_table
    # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable
    # The conditionals are parenthesized on purpose: `assert a == 1 if c else 2`
    # parses as `assert (a == 1) if c else 2`, whose else branch asserts the
    # truthy constant 2 and therefore checks nothing.
    assert len(table.blocks) == (1 if axis == 1 else 2)
    assert len(table.blocks[0]) == (1 if axis == 0 else 2)
    assert axis == 1 or len(table.blocks[1]) == 1
    assert isinstance(table.blocks[0][0], InMemoryTable)
    assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1],
                      MemoryMappedTable)
Ejemplo n.º 6
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Check ``concat_tables`` along both axes on a mix of table kinds.

    Along axis 0 the result is compared with ``pa.concat_tables``. Along
    axis 1 the inputs first go through ``add_suffix_to_column_names``. In both
    cases three blocks are expected, in the order in-memory, memory-mapped,
    in-memory.
    """
    pa_table = in_memory_pa_table
    in_memory = InMemoryTable(pa_table)
    memory_mapped = MemoryMappedTable.from_file(arrow_file)
    concatenation = ConcatenationTable.from_blocks(in_memory)
    tables = [pa_table, in_memory, memory_mapped, concatenation]
    expected_kinds = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    stacked = concat_tables(tables, axis=0)
    assert stacked.table == pa.concat_tables([pa_table] * 4)
    assert stacked.table.shape == (40, 4)
    assert isinstance(stacked, ConcatenationTable)
    # the first two inputs are consolidated as a single InMemoryTable
    assert len(stacked.blocks) == 3
    for position, kind in enumerate(expected_kinds):
        assert isinstance(stacked.blocks[position][0], kind)

    # add suffix to avoid error due to duplicate column names
    suffixed = [
        add_suffix_to_column_names(table, i) for i, table in enumerate(tables)
    ]
    side_by_side = concat_tables(suffixed, axis=1)
    assert side_by_side.table.shape == (10, 16)
    # the first two inputs are consolidated as a single InMemoryTable
    assert len(side_by_side.blocks[0]) == 3
    for position, kind in enumerate(expected_kinds):
        assert isinstance(side_by_side.blocks[0][position], kind)
Ejemplo n.º 7
0
def test_concatenation_table_from_blocks(in_memory_pa_table, in_memory_blocks):
    """Check ``ConcatenationTable.from_blocks`` on several input shapes.

    A single table, a flat list of two slices, a nested list of those slices
    and the ``in_memory_blocks`` fixture must each yield a
    ``ConcatenationTable`` whose data equals the reference pyarrow table and
    whose blocks compare equal to ``[[in_memory_table]]``.
    """
    assert len(in_memory_pa_table) > 2
    in_memory_table = InMemoryTable(in_memory_pa_table)
    # Split the table into its first two rows and the remainder.
    head = in_memory_table.slice(0, 2)
    tail = in_memory_table.slice(2)
    block_inputs = (
        in_memory_table,
        [head, tail],
        [[head], [tail]],
        in_memory_blocks,
    )
    for block_input in block_inputs:
        table = ConcatenationTable.from_blocks(block_input)
        assert isinstance(table, ConcatenationTable)
        assert table.table == in_memory_pa_table
        assert table.blocks == [[in_memory_table]]
Ejemplo n.º 8
0
def test_concatenation_table_remove_column(
        blocks_type, in_memory_pa_table, in_memory_blocks,
        memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks):
    """Removing column 0 must match ``pa.Table.remove_column`` and keep the type."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    concatenated = ConcatenationTable.from_blocks(blocks_by_type[blocks_type])
    table = concatenated.remove_column(0)
    expected = in_memory_pa_table.remove_column(0)
    assert table.table == expected
    assert isinstance(table, ConcatenationTable)
Ejemplo n.º 9
0
def test_concatenation_table_drop(blocks_type, in_memory_pa_table,
                                  in_memory_blocks, memory_mapped_blocks,
                                  mixed_in_memory_and_memory_mapped_blocks):
    """Dropping the first column by name must match ``pa.Table.drop``."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    first_column, *_ = in_memory_pa_table.column_names[:1]
    to_drop = [first_column]
    concatenated = ConcatenationTable.from_blocks(blocks)
    table = concatenated.drop(to_drop)
    expected = in_memory_pa_table.drop(to_drop)
    assert table.table == expected
    assert isinstance(table, ConcatenationTable)
Ejemplo n.º 10
0
def test_concatenation_table_filter(blocks_type, in_memory_pa_table,
                                    in_memory_blocks, memory_mapped_blocks,
                                    mixed_in_memory_and_memory_mapped_blocks):
    """Filtering with an every-other-row mask must match ``pa.Table.filter``."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    # True for even row indices, False for odd ones.
    keep_flags = []
    for row_index in range(len(in_memory_pa_table)):
        keep_flags.append(row_index % 2 == 0)
    mask = pa.array(keep_flags)
    concatenated = ConcatenationTable.from_blocks(blocks)
    table = concatenated.filter(mask)
    expected = in_memory_pa_table.filter(mask)
    assert table.table == expected
    assert isinstance(table, ConcatenationTable)
def test_concatenation_table_replace_schema_metadata(
    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks
):
    """Replacing schema metadata must give the same metadata as on a plain pyarrow table."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    new_metadata = {"huggingface": "{}"}
    concatenated = ConcatenationTable.from_blocks(blocks)
    table = concatenated.replace_schema_metadata(new_metadata)
    reference = in_memory_pa_table.replace_schema_metadata(new_metadata)
    assert table.table.schema.metadata == reference.schema.metadata
    assert isinstance(table, ConcatenationTable)
Ejemplo n.º 12
0
def test_concatenation_table_from_blocks_doesnt_increase_memory(
        blocks_type, in_memory_pa_table, in_memory_blocks,
        memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks):
    """Build from blocks and check the result, all under the Arrow-memory guard.

    Both the construction and the assertions run inside
    ``assert_arrow_memory_doesnt_increase``.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    source_blocks = blocks_by_type[blocks_type]
    with assert_arrow_memory_doesnt_increase():
        built = ConcatenationTable.from_blocks(source_blocks)
        assert isinstance(built, ConcatenationTable)
        assert built.table == in_memory_pa_table
        assert built.blocks == source_blocks
Ejemplo n.º 13
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Check ``concat_tables`` on a mix of table kinds.

    The result must equal the plain pyarrow concatenation and expose four
    blocks: in-memory, in-memory, memory-mapped, in-memory.
    """
    pa_table = in_memory_pa_table
    in_memory = InMemoryTable(pa_table)
    memory_mapped = MemoryMappedTable.from_file(arrow_file)
    concatenation = ConcatenationTable.from_blocks(in_memory)
    inputs = [pa_table, in_memory, memory_mapped, concatenation]
    result = concat_tables(inputs)
    assert result.table == pa.concat_tables([pa_table] * 4)
    assert isinstance(result, ConcatenationTable)
    assert len(result.blocks) == 4
    expected_kinds = (
        InMemoryTable,
        InMemoryTable,
        MemoryMappedTable,
        InMemoryTable,
    )
    for position, kind in enumerate(expected_kinds):
        assert isinstance(result.blocks[position][0], kind)
def test_concatenation_table_rename_columns(
    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks
):
    """Renaming the "tokens" column must match ``pa.Table.rename_columns``."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    blocks = blocks_by_type[blocks_type]
    assert "tokens" in in_memory_pa_table.column_names
    # Keep every name as is, except "tokens" which becomes "new_tokens".
    renamed = []
    for column_name in in_memory_pa_table.column_names:
        if column_name == "tokens":
            renamed.append("new_tokens")
        else:
            renamed.append(column_name)
    concatenated = ConcatenationTable.from_blocks(blocks)
    table = concatenated.rename_columns(renamed)
    assert isinstance(table, ConcatenationTable)
    assert table.table == in_memory_pa_table.rename_columns(renamed)
Ejemplo n.º 15
0
def test_concatenation_table_pickle(blocks_type, in_memory_blocks,
                                    memory_mapped_blocks,
                                    mixed_in_memory_and_memory_mapped_blocks):
    """A pickle round trip must preserve the table, its blocks and its index attributes."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    original = ConcatenationTable.from_blocks(blocks_by_type[blocks_type])
    restored = pickle.loads(pickle.dumps(original))
    assert restored.table == original.table
    assert restored.blocks == original.blocks
    assert_index_attributes_equal(original, restored)
Ejemplo n.º 16
0
def test_concatenation_table_from_tables(in_memory_pa_table):
    """Check ``ConcatenationTable.from_tables`` on three in-memory inputs.

    The inputs are a raw pyarrow table, an ``InMemoryTable`` and a
    ``ConcatenationTable``. The result must equal the plain pyarrow
    concatenation and be made of three single-table blocks, each an
    ``InMemoryTable`` holding the reference data. Everything runs inside
    ``assert_arrow_memory_doesnt_increase``.
    """
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    inputs = [in_memory_pa_table, in_memory_table, concatenation_table]
    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(inputs)
        assert table.table == pa.concat_tables([in_memory_pa_table] * 3)
        assert isinstance(table, ConcatenationTable)
        assert len(table.blocks) == 3
        for row in table.blocks:
            assert len(row) == 1
        for row in table.blocks:
            assert isinstance(row[0], InMemoryTable)
        for row in table.blocks:
            assert row[0].table == in_memory_pa_table
def test_concatenation_table_deepcopy(
    blocks_type, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks
):
    """A deep copy must equal the original and share its underlying arrow objects."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    original = ConcatenationTable.from_blocks(blocks_by_type[blocks_type])
    duplicate = copy.deepcopy(original)
    assert original.table == duplicate.table
    assert original.blocks == duplicate.blocks
    assert_index_attributes_equal(original, duplicate)
    # deepcopy must return the exact same arrow objects since they are immutable
    assert original.table is duplicate.table
    for original_batch, duplicate_batch in zip(original._batches, duplicate._batches):
        assert original_batch is duplicate_batch
Ejemplo n.º 18
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Check ``concat_tables`` along both axes on a mix of table kinds.

    Along axis 0 the result is compared with ``pa.concat_tables``. Along both
    axes three blocks are expected, in the order in-memory, memory-mapped,
    in-memory.
    """
    pa_table = in_memory_pa_table
    in_memory = InMemoryTable(pa_table)
    memory_mapped = MemoryMappedTable.from_file(arrow_file)
    concatenation = ConcatenationTable.from_blocks(in_memory)
    tables = [pa_table, in_memory, memory_mapped, concatenation]
    expected_kinds = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    stacked = concat_tables(tables, axis=0)
    assert stacked.table == pa.concat_tables([pa_table] * 4)
    assert stacked.table.shape == (40, 4)
    assert isinstance(stacked, ConcatenationTable)
    # the first two inputs are consolidated as a single InMemoryTable
    assert len(stacked.blocks) == 3
    for position, kind in enumerate(expected_kinds):
        assert isinstance(stacked.blocks[position][0], kind)

    side_by_side = concat_tables(tables, axis=1)
    assert side_by_side.table.shape == (10, 16)
    # the first two inputs are consolidated as a single InMemoryTable
    assert len(side_by_side.blocks[0]) == 3
    for position, kind in enumerate(expected_kinds):
        assert isinstance(side_by_side.blocks[0][position], kind)