def test_memory_mapped_table_filter(arrow_file, in_memory_pa_table):
    """Filter a memory-mapped table and compare it with the pyarrow result.

    The result must still be a ``MemoryMappedTable`` whose ``replays`` list
    holds the single ``filter`` call that was applied.
    """
    row_count = len(in_memory_pa_table)
    # Boolean mask that is True for every even row index.
    mask = pa.array([row % 2 == 0 for row in range(row_count)])
    table = MemoryMappedTable.from_file(arrow_file).filter(mask)
    expected = in_memory_pa_table.filter(mask)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    expected_replays = [("filter", (mask,), {})]
    assert table.replays == expected_replays
    assert_deepcopy_without_bringing_data_in_memory(table)
    # filter DOES increase memory, so the "does bring data in memory" pickle
    # check is used here instead of the "without bringing data" one.
    assert_pickle_does_bring_data_in_memory(table)
Exemple #2
0
def test_memory_mapped_table_set_column(arrow_file, in_memory_pa_table):
    """Call ``set_column`` on a memory-mapped table and compare with pyarrow.

    The column index used is the current number of columns, the field name is
    ``"new_field"`` and the values are ``0 .. len(table) - 1``.
    """
    position = len(in_memory_pa_table.column_names)
    field_name = "new_field"
    values = pa.array(list(range(len(in_memory_pa_table))))
    call_args = (position, field_name, values)
    table = MemoryMappedTable.from_file(arrow_file).set_column(*call_args)
    expected = in_memory_pa_table.set_column(*call_args)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The operation must be recorded with exactly the arguments passed above.
    assert table.replays == [("set_column", call_args, {})]
    assert_deepcopy_without_bringing_data_in_memory(table)
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #3
0
def test_memory_mapped_table_from_file_with_replay(arrow_file, in_memory_pa_table):
    """Load a memory-mapped table with replays and compare with pyarrow.

    The replays (a one-row ``slice`` followed by ``flatten``) are passed to
    ``from_file`` inside the ``assert_arrow_memory_doesnt_increase`` context,
    then applied by hand to the in-memory reference table for comparison.
    """
    replays = [("slice", (0, 1), {}), ("flatten", (), {})]
    with assert_arrow_memory_doesnt_increase():
        table = MemoryMappedTable.from_file(arrow_file, replays=replays)
    assert len(table) == 1
    # Build the expected table by calling each replayed method in order.
    expected = in_memory_pa_table
    for method_name, positional, keyword in replays:
        operation = getattr(expected, method_name)
        expected = operation(*positional, **keyword)
    assert table.table == expected
    assert_deepcopy_without_bringing_data_in_memory(table)
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_rename_columns(arrow_file, in_memory_pa_table):
    """Rename ``tokens`` to ``new_tokens`` and compare with the pyarrow result."""
    original_names = in_memory_pa_table.column_names
    assert "tokens" in original_names
    # Same column order as before; only the "tokens" entry is replaced.
    names = ["new_tokens" if name == "tokens" else name for name in original_names]
    table = MemoryMappedTable.from_file(arrow_file).rename_columns(names)
    expected = in_memory_pa_table.rename_columns(names)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("rename_columns", (names,), {})]
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #5
0
def test_memory_mapped_table_deepcopy(arrow_file):
    """Deep-copy a memory-mapped table and check the copy shares arrow objects."""
    table = MemoryMappedTable.from_file(arrow_file)
    copied_table = copy.deepcopy(table)
    assert table.table == copied_table.table
    assert table.path == copied_table.path
    assert_index_attributes_equal(table, copied_table)
    # deepcopy must return the exact same arrow objects since they are immutable
    assert table.table is copied_table.table
    batch_pairs = zip(table._batches, copied_table._batches)
    for original_batch, copied_batch in batch_pairs:
        assert original_batch is copied_batch
Exemple #6
0
def test_memory_mapped_table_replace_schema_metadata(arrow_file, in_memory_pa_table):
    """Replace the schema metadata and compare it with the pyarrow result.

    Only the resulting ``schema.metadata`` values are compared, not the
    tables themselves.
    """
    metadata = {"huggingface": "{}"}
    table = MemoryMappedTable.from_file(arrow_file).replace_schema_metadata(metadata)
    expected_metadata = in_memory_pa_table.replace_schema_metadata(metadata).schema.metadata
    assert table.table.schema.metadata == expected_metadata
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("replace_schema_metadata", (metadata,), {})]
    assert_deepcopy_without_bringing_data_in_memory(table)
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #7
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Concatenate four table flavours and check the resulting blocks.

    The inputs are a raw pyarrow table, an ``InMemoryTable``, a
    ``MemoryMappedTable`` and a ``ConcatenationTable``; the result is expected
    to hold one block row per input.
    """
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    concatenated_table = concat_tables([t0, t1, t2, t3])
    assert concatenated_table.table == pa.concat_tables([t0] * 4)
    assert isinstance(concatenated_table, ConcatenationTable)
    assert len(concatenated_table.blocks) == 4
    # Expected type of the first block in each block row, in input order.
    expected_types = (InMemoryTable, InMemoryTable, MemoryMappedTable, InMemoryTable)
    for row_index, expected_type in enumerate(expected_types):
        assert isinstance(concatenated_table.blocks[row_index][0], expected_type)
def test_memory_mapped_table_cast(arrow_file, in_memory_pa_table):
    """Cast ``list<int64>`` columns to ``list<int32>`` and compare with pyarrow."""
    int64_list = pa.list_(pa.int64())
    reference_schema = in_memory_pa_table.schema
    assert int64_list in reference_schema.types
    # Target schema: same names, with every list<int64> swapped for list<int32>.
    target_fields = {}
    for name, dtype in zip(reference_schema.names, reference_schema.types):
        target_fields[name] = dtype if dtype != int64_list else pa.list_(pa.int32())
    schema = pa.schema(target_fields)
    table = MemoryMappedTable.from_file(arrow_file).cast(schema)
    expected = in_memory_pa_table.cast(schema)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("cast", (schema,), {})]
    # cast DOES increase memory when converting integers precision for example,
    # so the "does bring data in memory" pickle check is used here.
    assert_pickle_does_bring_data_in_memory(table)
Exemple #9
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Concatenate four table flavours along axis 0 and axis 1.

    In both directions the result is expected to hold three blocks, typed
    ``InMemoryTable``, ``MemoryMappedTable``, ``InMemoryTable``.
    """
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    tables = [t0, t1, t2, t3]
    # t0 and t1 are consolidated as a single InMemoryTable
    expected_types = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    # Axis 0: blocks are stacked as rows of one block each.
    stacked = concat_tables(tables, axis=0)
    assert stacked.table == pa.concat_tables([t0] * 4)
    assert stacked.table.shape == (40, 4)
    assert isinstance(stacked, ConcatenationTable)
    assert len(stacked.blocks) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(stacked.blocks[index][0], expected_type)

    # Axis 1: all blocks sit side by side in the first block row.
    side_by_side = concat_tables(tables, axis=1)
    assert side_by_side.table.shape == (10, 16)
    first_row = side_by_side.blocks[0]
    assert len(first_row) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(side_by_side.blocks[0][index], expected_type)
Exemple #10
0
def test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file):
    """Check ``ConcatenationTable.from_tables`` along ``axis`` (0 or 1).

    Four inputs are concatenated: a raw pyarrow table, an ``InMemoryTable``,
    a ``ConcatenationTable`` built from that in-memory table and a
    ``MemoryMappedTable``. The result is compared with a reference built
    directly with pyarrow, and the block layout is checked.
    """
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    memory_mapped_table = MemoryMappedTable.from_file(arrow_file)
    tables = [in_memory_pa_table, in_memory_table, concatenation_table, memory_mapped_table]
    if axis == 0:
        expected_table = pa.concat_tables([in_memory_pa_table] * len(tables))
    else:
        # Horizontal reference: append every column once per additional table.
        expected_table = in_memory_pa_table
        for _ in range(1, len(tables)):
            for name, col in zip(in_memory_pa_table.column_names, in_memory_pa_table.columns):
                expected_table = expected_table.append_column(name, col)

    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(tables, axis=axis)
    assert isinstance(table, ConcatenationTable)
    assert table.table == expected_table
    # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable
    # NOTE: the parentheses matter. Without them the statement parses as
    # ``assert (len(...) == 1) if axis == 1 else 2`` and the else-branch
    # asserts the truthy constant 2, i.e. checks nothing.
    assert len(table.blocks) == (1 if axis == 1 else 2)
    assert len(table.blocks[0]) == (1 if axis == 0 else 2)
    assert axis == 1 or len(table.blocks[1]) == 1
    assert isinstance(table.blocks[0][0], InMemoryTable)
    assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1], MemoryMappedTable)
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Concatenate four table flavours along axis 0 and axis 1.

    For axis 1 each input first gets a per-table suffix on its column names.
    In both directions the result is expected to hold three blocks, typed
    ``InMemoryTable``, ``MemoryMappedTable``, ``InMemoryTable``.
    """
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    tables = [t0, t1, t2, t3]
    # t0 and t1 are consolidated as a single InMemoryTable
    expected_types = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    # Axis 0: blocks are stacked as rows of one block each.
    stacked = concat_tables(tables, axis=0)
    assert stacked.table == pa.concat_tables([t0] * 4)
    assert stacked.table.shape == (40, 4)
    assert isinstance(stacked, ConcatenationTable)
    assert len(stacked.blocks) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(stacked.blocks[index][0], expected_type)

    # add suffix to avoid error due to duplicate column names
    suffixed_tables = []
    for i, table in enumerate(tables):
        suffixed_tables.append(add_suffix_to_column_names(table, i))
    side_by_side = concat_tables(suffixed_tables, axis=1)
    assert side_by_side.table.shape == (10, 16)
    assert len(side_by_side.blocks[0]) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(side_by_side.blocks[0][index], expected_type)
Exemple #12
0
def test_memory_mapped_table_slice(arrow_file, in_memory_pa_table):
    """Slice a memory-mapped table with ``(1, 2)`` and compare with pyarrow."""
    slice_args = (1, 2)
    table = MemoryMappedTable.from_file(arrow_file).slice(*slice_args)
    expected = in_memory_pa_table.slice(*slice_args)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The slice must be recorded as the only replay.
    assert table.replays == [("slice", slice_args, {})]
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #13
0
def memory_mapped_blocks(arrow_file):
    """Return ``_to_testing_blocks`` applied to the memory-mapped ``arrow_file``."""
    return _to_testing_blocks(MemoryMappedTable.from_file(arrow_file))
Exemple #14
0
def test_memory_mapped_table_pickle_doesnt_fill_memory(arrow_file):
    """Load, deep-copy and pickle a memory-mapped table under memory checks.

    Loading runs inside ``assert_arrow_memory_doesnt_increase``; the deepcopy
    and pickle helpers are then run on the loaded table, in that order.
    """
    with assert_arrow_memory_doesnt_increase():
        table = MemoryMappedTable.from_file(arrow_file)
    memory_checks = (
        assert_deepcopy_without_bringing_data_in_memory,
        assert_pickle_without_bringing_data_in_memory,
    )
    for check in memory_checks:
        check(table)
Exemple #15
0
def test_memory_mapped_table_remove_column(arrow_file, in_memory_pa_table):
    """Remove column 0 from a memory-mapped table and compare with pyarrow."""
    column_index = 0
    table = MemoryMappedTable.from_file(arrow_file).remove_column(column_index)
    expected = in_memory_pa_table.remove_column(column_index)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The removal must be recorded as the only replay.
    assert table.replays == [("remove_column", (column_index,), {})]
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #16
0
def test_memory_mapped_table_combine_chunks(arrow_file, in_memory_pa_table):
    """Call ``combine_chunks`` on a memory-mapped table and compare with pyarrow."""
    table = MemoryMappedTable.from_file(arrow_file).combine_chunks()
    expected = in_memory_pa_table.combine_chunks()
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The call takes no arguments, so the recorded replay has empty args/kwargs.
    expected_replays = [("combine_chunks", (), {})]
    assert table.replays == expected_replays
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #17
0
def test_memory_mapped_table_flatten(arrow_file, in_memory_pa_table):
    """Call ``flatten`` on a memory-mapped table and compare with pyarrow."""
    table = MemoryMappedTable.from_file(arrow_file).flatten()
    expected = in_memory_pa_table.flatten()
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The call takes no arguments, so the recorded replay has empty args/kwargs.
    expected_replays = [("flatten", (), {})]
    assert table.replays == expected_replays
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #18
0
def test_memory_mapped_table_from_file(arrow_file, in_memory_pa_table):
    """Load a memory-mapped table from ``arrow_file`` and compare with pyarrow.

    Loading runs inside the ``assert_arrow_memory_doesnt_increase`` context.
    """
    with assert_arrow_memory_doesnt_increase():
        table = MemoryMappedTable.from_file(arrow_file)
    loaded = table.table
    assert loaded == in_memory_pa_table
    assert isinstance(table, MemoryMappedTable)
    assert_pickle_without_bringing_data_in_memory(table)
Exemple #19
0
def test_memory_mapped_table_init(arrow_file, in_memory_pa_table):
    """Build a ``MemoryMappedTable`` through its constructor and check it.

    The constructor receives the table returned by
    ``_memory_mapped_arrow_table_from_file`` together with the file path.
    """
    mapped_pa_table = _memory_mapped_arrow_table_from_file(arrow_file)
    table = MemoryMappedTable(mapped_pa_table, arrow_file)
    assert table.table == in_memory_pa_table
    assert isinstance(table, MemoryMappedTable)
    assert_pickle_without_bringing_data_in_memory(table)