Example no. 1
def test_bloom_index_datetime(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)")
    for i in range(50):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s)",
                       (i + 1, dt(2015, 1, 1) + timedelta(days=i)))
    head = OUTPUT.commit(
        extra_indexes={"test": {
            "bloom": {
                "value_1": {
                    "probability": 0.01
                }
            }
        }})
    objects = head.get_table("test").objects

    # Datetimes are supported in the bloom index if they're passed in as actual datetime
    # objects (which Multicorn does).

    # Spot check some dates that exist...
    for i in range(0, 50, 5):
        assert (filter_bloom_index(OUTPUT.engine, objects, [[
            ("value_1", "=", dt(2015, 1, 1) + timedelta(days=i))
        ]]) == objects)

    # ...and some that don't.
    assert (filter_bloom_index(
        OUTPUT.engine, objects,
        [[("value_1", "=", dt(2015, 1, 1) + timedelta(days=55))]]) == [])

    # They also work if passed in as ISO strings with space as a separator.
    assert (filter_bloom_index(
        OUTPUT.engine, objects,
        [[("value_1", "=", "2015-01-01 00:00:00")]]) == objects)
Example no. 2
def test_bloom_index_post_factum(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)")
    for i in range(50):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s)",
                       (i + 1, dt(2015, 1, 1) + timedelta(days=i)))
    head = OUTPUT.commit()

    # Reindex the table after committing it and check that object metadata gets overwritten.
    head.get_table("test").reindex(
        extra_indexes={"bloom": {
            "value_1": {
                "probability": 0.01
            }
        }})

    obj = head.get_table("test").objects[0]
    object_index = OUTPUT.objects.get_object_meta([obj])[obj].object_index
    assert "bloom" in object_index

    # Run a snippet from the previous test to check the bloom metadata is valid.
    objects = head.get_table("test").objects
    for i in range(0, 50, 5):
        assert (filter_bloom_index(OUTPUT.engine, objects, [[
            ("value_1", "=", dt(2015, 1, 1) + timedelta(days=i))
        ]]) == objects)
Example no. 3
def _setup_dataset():
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar);
        INSERT INTO test VALUES (1, 'test')""")
    OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    return OUTPUT.commit()
Example no. 4
def test_base_fragment_reused_chunking(local_engine_empty):
    # Check that if we split a table into chunks and some chunks are the same, they get assigned to the same objects.
    OUTPUT.init()
    base = OUTPUT.head
    _make_test_table(OUTPUT)
    OUTPUT.commit(chunk_size=5)
    table_1 = OUTPUT.head.get_table("test")
    # Table 1 produced 3 objects
    assert len(OUTPUT.objects.get_all_objects()) == 3

    # All chunks are the same
    base.checkout()
    _make_test_table(OUTPUT)
    OUTPUT.commit(chunk_size=5)
    table_2 = OUTPUT.head.get_table("test")
    assert len(OUTPUT.objects.get_all_objects()) == 3
    assert table_1.objects == table_2.objects

    # Change a row in the middle chunk so that its contents differ. The change gets conflated
    # with the table's other pending changes, so it isn't recorded as a separate update fragment.
    base.checkout()
    _make_test_table(OUTPUT)
    OUTPUT.run_sql(
        "UPDATE test SET value_1 = 'UPDATED', value_2 = 42 WHERE key = 7")
    OUTPUT.commit(chunk_size=5)
    table_3 = OUTPUT.head.get_table("test")
    assert len(OUTPUT.objects.get_all_objects()) == 4
    # Table 3 reused the first and the last object but created a new one for the middle fragment.
    assert len(table_3.objects) == 3
    assert table_3.objects[0] == table_1.objects[0]
    assert table_3.objects[1] != table_1.objects[1]
    assert table_3.objects[2] == table_1.objects[2]
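The reuse above relies on fragments being content-addressed: chunks with identical rows hash to the same object, so re-committing an unchanged table creates no new objects and only the modified middle chunk gets a new one. A minimal, self-contained sketch of that idea (the chunk_id helper and hash layout here are illustrative, not Splitgraph's actual object naming):

import hashlib
from typing import List, Sequence, Tuple

Row = Tuple[int, str, int]

def chunk_id(rows: Sequence[Row]) -> str:
    # Hash the chunk's contents; identical chunks always map to the same ID.
    return "o" + hashlib.sha256(repr(list(rows)).encode("ascii")).hexdigest()[:62]

def split_into_chunks(rows: List[Row], chunk_size: int) -> List[List[Row]]:
    return [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]

rows_v1 = [(k, "value_%d" % k, k * 2) for k in range(1, 16)]
# Same table, but one row in the middle chunk is changed.
rows_v2 = [(7, "UPDATED", 42) if r[0] == 7 else r for r in rows_v1]

ids_v1 = [chunk_id(c) for c in split_into_chunks(rows_v1, 5)]
ids_v2 = [chunk_id(c) for c in split_into_chunks(rows_v2, 5)]

# The first and last chunks are unchanged and keep their IDs;
# only the middle chunk produces a new object.
assert ids_v1[0] == ids_v2[0]
assert ids_v1[1] != ids_v2[1]
assert ids_v1[2] == ids_v2[2]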
Example no. 5
def test_layered_querying_json_arrays(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, "
        "value JSONB, arr_value INTEGER[], arr_2d_value TEXT[][])"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (1, %s, %s, %s)",
        (json.dumps({"a": 1, "b": 2.5}), [1, 2, 3], [["one", "two"], ["three", "four"]]),
    )
    OUTPUT.commit()
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (2, %s, %s, %s)",
        (
            json.dumps({"a": "one", "b": "two point five"}),
            [4, 5, 6],
            [["five", "six"], ["seven", "eight"]],
        ),
    )
    head = OUTPUT.commit()
    OUTPUT.uncheckout()
    head.checkout(layered=True)

    assert OUTPUT.run_sql("SELECT * FROM test ORDER BY key") == [
        (1, {"a": 1, "b": 2.5}, [1, 2, 3], [["one", "two"], ["three", "four"]]),
        (2, {"a": "one", "b": "two point five"}, [4, 5, 6], [["five", "six"], ["seven", "eight"]]),
    ]
Example no. 6
def test_import_and_update(local_engine_empty, unprivileged_pg_repo):
    OUTPUT.init()
    head = OUTPUT.head
    remote_head = unprivileged_pg_repo.images["latest"]
    # Import the 'fruits' table from the origin.
    import_table_from_remote(unprivileged_pg_repo, ["fruits"],
                             remote_head.image_hash,
                             OUTPUT,
                             target_tables=[])
    new_head = OUTPUT.head

    OUTPUT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    new_head_2 = OUTPUT.commit()

    head.checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"),
                                                      (2, "orange")]

    new_head_2.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
Example no. 7
def test_import_bare(pg_repo_local):
    # Check import without checking anything out, just by manipulating metadata and running LQs against
    # source images.

    # Create a new schema and import 'fruits'
    OUTPUT.init()
    # Make sure the existing table is preserved.
    OUTPUT.run_sql("CREATE TABLE sentinel (key INTEGER)")
    OUTPUT.commit()
    pg_repo_local.uncheckout()
    OUTPUT.uncheckout()

    OUTPUT.import_tables(
        tables=["imported_fruits"],
        source_repository=pg_repo_local,
        image_hash=pg_repo_local.images["latest"].image_hash,
        source_tables=["SELECT * FROM fruits WHERE fruit_id = 1"],
        parent_hash=OUTPUT.images["latest"].image_hash,
        do_checkout=False,
        table_queries=[True],
    )

    assert OUTPUT.head is None
    assert pg_repo_local.head is None

    assert sorted(OUTPUT.images["latest"].get_tables()) == [
        "imported_fruits", "sentinel"
    ]
    assert list(OUTPUT.images["latest"].get_table("imported_fruits").query(
        columns=["name"], quals=[])) == [{
            "name": "apple"
        }]
Example no. 8
def test_commandline_show_empty_image(local_engine_empty):
    # Check size calculations etc in an empty image don't cause errors.
    runner = CliRunner()
    OUTPUT.init()
    assert OUTPUT.images["latest"].get_size() == 0
    result = runner.invoke(show_c, [str(OUTPUT) + ":" + "000000000000"],
                           catch_exceptions=False)
    assert "Size: 0.00 B" in result.output
Example no. 9
def test_bloom_index_deletions(local_engine_empty):
    # Check the bloom index fingerprint includes both the old and the new values of deleted/added cells.

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    # Insert 26 rows with value_1 spanning a-z
    for i in range(26):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    OUTPUT.commit()

    # Delete and update some rows
    OUTPUT.run_sql("DELETE FROM test WHERE key = 5")  # ('e', 8)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 10")  # ('j', 18)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 15")  # ('o', 28)
    OUTPUT.run_sql("UPDATE test SET value_1 = 'G' WHERE key = 7")  # (g -> G)
    OUTPUT.run_sql("UPDATE test SET value_2 = 23 WHERE key = 12")  # (22 -> 23)

    head = OUTPUT.commit(
        extra_indexes={
            "test": {
                "bloom": {
                    "value_1": {
                        "probability": 0.01
                    },
                    "value_2": {
                        "probability": 0.01
                    }
                }
            }
        })
    objects = head.get_table("test").objects

    # Sanity check: 2 objects (original data + new with 3 deletions and 2 upserts)
    assert len(objects) == 2
    assert (local_engine_empty.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta." + objects[1],
        return_shape=ResultShape.ONE_ONE) == 5)

    # Check old/new values for value_1: 3 old values before a deletion,
    # 1 old value before update, 1 updated value
    value_1_vals = ["e", "j", "o", "g", "G"]

    # value_2, same
    value_2_vals = [8, 18, 28, 22, 23]

    for val in value_1_vals:
        assert filter_bloom_index(OUTPUT.engine, objects,
                                  [[("value_1", "=", val)]]) == objects

    for val in value_2_vals:
        assert filter_bloom_index(OUTPUT.engine, objects,
                                  [[("value_2", "=", val)]]) == objects
Example no. 10
def test_rollback_on_error(local_engine_empty):
    # For commands like commit/checkout, the library doesn't commit or roll back
    # transactions itself and expects the caller to manage them. In the CLI,
    # we need to make sure that erroneous transactions (e.g. interrupted SG commits)
    # are rolled back correctly instead of being committed.
    runner = CliRunner()

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(11):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    head = OUTPUT.commit(chunk_size=5,
                         in_fragment_order={"test": ["key", "value_1"]})
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    _alter_diff_splitting_dataset()
    OUTPUT.commit_engines()

    # Simulate the commit getting interrupted by the first object going through and being
    # recorded, then a KeyboardInterrupt being raised.
    called_once = False

    def interrupted_register(*args, **kwargs):
        nonlocal called_once
        if called_once:
            raise BaseException("something went wrong")
        else:
            called_once = True
            return FragmentManager._register_object(*args, **kwargs)

    with patch(
            "splitgraph.core.fragment_manager.FragmentManager._register_object",
            side_effect=interrupted_register,
    ) as ro:
        with pytest.raises(BaseException):
            runner.invoke(cli, ["commit", OUTPUT.to_schema()])

    # Check that no image/object metadata was written
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    assert ro.call_count == 2

    # Check that the data in the audit trigger wasn't deleted
    assert len(
        OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(),
                                          table="test")) == 6
Example no. 11
def test_import_multiple_tables(pg_repo_local):
    OUTPUT.init()
    head = OUTPUT.head
    # Import with empty table lists: this brings in all tables from the source image
    # (the test checks both 'fruits' and 'vegetables' below).
    OUTPUT.import_tables(tables=[],
                         source_repository=pg_repo_local,
                         source_tables=[])

    for table_name in ["fruits", "vegetables"]:
        assert OUTPUT.run_sql("SELECT * FROM %s" %
                              table_name) == pg_repo_local.run_sql(
                                  "SELECT * FROM %s" % table_name)

    new_head = OUTPUT.head
    assert new_head != head
    assert new_head.parent_id == head.image_hash
Example no. 12
def test_import_preserves_pending_changes(pg_repo_local):
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar);
            INSERT INTO test VALUES (1, 'test')""")
    head = OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    changes = get_engine().get_pending_changes(OUTPUT.to_schema(), "test")

    OUTPUT.import_tables(tables=["imported_fruits"],
                         source_repository=pg_repo_local,
                         source_tables=["fruits"])

    assert OUTPUT.head.parent_id == head.image_hash
    assert changes == OUTPUT.engine.get_pending_changes(
        OUTPUT.to_schema(), "test")
Example no. 13
def test_import_basic(pg_repo_local):
    # Create a new schema and import 'fruits' from the mounted PG table.
    OUTPUT.init()
    head = OUTPUT.head

    OUTPUT.import_tables(tables=["imported_fruits"],
                         source_repository=pg_repo_local,
                         source_tables=["fruits"])

    assert OUTPUT.run_sql("SELECT * FROM imported_fruits"
                          ) == pg_repo_local.run_sql("SELECT * FROM fruits")
    new_head = OUTPUT.head

    assert new_head != head
    assert new_head.parent_id == head.image_hash
Example no. 14
def test_import_query_reuses_hash(pg_repo_local):
    OUTPUT.init()
    base = OUTPUT.head
    # Run two imports: one importing all rows from `fruits` (will reuse the original `fruits` object),
    # one importing just the first row (new hash, won't be reused).
    ih_v1 = OUTPUT.import_tables(
        source_repository=pg_repo_local,
        source_tables=[
            "SELECT * FROM fruits", "SELECT * FROM fruits WHERE fruit_id = 1"
        ],
        tables=["fruits_all", "fruits_one"],
        do_checkout=False,
        table_queries=[True, True],
    )
    v1 = OUTPUT.images.by_hash(ih_v1)
    assert v1.get_table("fruits_all").objects == pg_repo_local.head.get_table(
        "fruits").objects
    assert (len(OUTPUT.objects.get_all_objects()) == 3
            )  # Original fruits and vegetables + the 1-row import

    # Run the same set of imports again: this time both query results already exist and will be reused.
    base.checkout()
    ih_v2 = OUTPUT.import_tables(
        source_repository=pg_repo_local,
        source_tables=[
            "SELECT * FROM fruits", "SELECT * FROM fruits WHERE fruit_id = 1"
        ],
        tables=["fruits_all", "fruits_one"],
        do_checkout=False,
        table_queries=[True, True],
    )
    v2 = OUTPUT.images.by_hash(ih_v2)
    assert v2.get_table("fruits_all").objects == v1.get_table(
        "fruits_all").objects
    assert v2.get_table("fruits_one").objects == v1.get_table(
        "fruits_one").objects
    assert len(OUTPUT.objects.get_all_objects()
               ) == 3  # No new objects have been created.
Example no. 15
def test_range_index_ordering_collation(local_engine_empty):
    # Test that range index gets min/max values of text columns using the "C" collation
    # (sort by byte values of characters) rather than anything else (e.g. in en-US
    # "a" comes before "B" even though "B" has a smaller ASCII code).

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key_1 INTEGER, key_2 VARCHAR,"
        " value_1 VARCHAR, value_2 INTEGER, PRIMARY KEY (key_1, key_2))")

    OUTPUT.engine.run_sql_batch(
        "INSERT INTO test VALUES (%s, %s, %s, %s)",
        [
            (1, "ONE", "apple", 4),
            (1, "one", "ORANGE", 3),
            (2, "two", "banana", 2),
            (2, "TWO", "CUCUMBER", 1),
        ],
        schema=OUTPUT.to_schema(),
    )

    head = OUTPUT.commit()
    object_id = head.get_table("test").objects[0]

    assert OUTPUT.objects.get_object_meta([object_id
                                           ])[object_id].object_index == {
                                               "range": {
                                                   "$pk": [[1, "ONE"],
                                                           [2, "two"]],
                                                   "key_1": [1, 2],
                                                   "key_2": ["ONE", "two"],
                                                   "value_1":
                                                   ["CUCUMBER", "banana"],
                                                   "value_2": [1, 4],
                                               }
                                           }
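The key_2 bounds of ["ONE", "two"] and value_1 bounds of ["CUCUMBER", "banana"] above are what byte-wise ("C" collation) ordering produces, since all uppercase ASCII letters sort before lowercase ones. A quick illustration in plain Python, whose default ASCII string comparison happens to match the byte order assumed here:

# Byte-wise ("C" collation) ordering: uppercase ASCII (65-90) sorts before lowercase (97-122).
keys = ["ONE", "one", "two", "TWO"]
assert min(keys) == "ONE" and max(keys) == "two"

values = ["apple", "ORANGE", "banana", "CUCUMBER"]
assert min(values) == "CUCUMBER" and max(values) == "banana"

# A case-insensitive (en-US-style) ordering would give different bounds:
assert sorted(values, key=str.casefold)[0] == "apple"
assert sorted(values, key=str.casefold)[-1] == "ORANGE"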
Example no. 16
def test_bloom_reindex_changed_table(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(26):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    OUTPUT.commit(chunk_size=13)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 5")  # ('e', 8)

    head = OUTPUT.commit()
    objects = head.get_table("test").objects
    assert len(objects) == 3  # original 2 fragments and one overwrite

    index_spec = {
        "bloom": {
            "value_1": {
                "probability": 0.01
            },
            "value_2": {
                "probability": 0.01
            }
        }
    }
    # Since the patch object deletes the old value, we don't immediately know what it was and so
    # can't reindex that object.
    with pytest.raises(ObjectIndexingError) as e:
        head.get_table("test").reindex(extra_indexes=index_spec)
    assert "1 object" in str(e.value)

    reindexed = head.get_table("test").reindex(extra_indexes=index_spec,
                                               raise_on_patch_objects=False)
    assert objects[0] in reindexed
    assert objects[1] in reindexed
    assert objects[2] not in reindexed
Example no. 17
def test_diff_fragment_hashing_long_chain(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key TIMESTAMP PRIMARY KEY, val1 INTEGER, val2 VARCHAR, val3 NUMERIC)"
    )
    OUTPUT.run_sql(
        "INSERT INTO TEST VALUES ('2019-01-01 01:01:01.111', 1, 'one', 1.1),"
        "('2019-01-02 02:02:02.222', 2, 'two', 2.2),"
        "('2019-01-03 03:03:03.333', 3, 'three', 3.3),"
        "('2019-01-04 04:04:04.444', 4, 'four', 4.4)")
    OUTPUT.commit()
    base = OUTPUT.head.get_table("test")

    OUTPUT.run_sql("DELETE FROM test WHERE key = '2019-01-03 03:03:03.333';"
                   "INSERT INTO test VALUES ('2019-01-05', 5, 'five', 5.5)")
    OUTPUT.commit()
    v1 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED', val1 = 42 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v2 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED AGAIN', val1 = 43 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v3 = OUTPUT.head.get_table("test")

    om = OUTPUT.objects
    final_hash, rows_inserted = OUTPUT.objects.calculate_content_hash(
        OUTPUT.to_schema(), "test")
    assert rows_inserted == 4

    schema_hash = om._calculate_schema_hash(base.table_schema)

    # Check that the final hash can be assembled out of intermediate objects' insertion and deletion hashes

    ins_hash_base, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, base.objects[0])
    assert ("o" + sha256(
        (ins_hash_base.hex() +
         schema_hash).encode("ascii")).hexdigest()[:-2] == base.objects[-1])
    assert rows_inserted == 4

    ins_hash_v1, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v1.objects[-1])
    assert rows_inserted == 1

    # A timestamp cast to text inside a tuple is wrapped with double quotes by PG.
    # As long as the hashing is consistent across engines, this doesn't matter,
    # but it might cause issues later with hash verification/deduplication.
    del_hash_v1 = Digest.from_hex(
        sha256('("2019-01-03 03:03:03.333",3,three,3.3)'.encode(
            "ascii")).hexdigest())
    assert del_hash_v1.hex(
    ) == "b12a93d54ba7ff1c2e26c92f01ac9c9d7716242eb47344d57c89b481227f5298"

    # Check that the object metadata contains the same hashes.
    v1_meta = om.get_object_meta(v1.objects)[v1.objects[-1]]
    assert ins_hash_v1.hex() == v1_meta.insertion_hash
    assert del_hash_v1.hex() == v1_meta.deletion_hash

    assert ("o" + sha256(
        ((ins_hash_v1 - del_hash_v1).hex() +
         schema_hash).encode("ascii")).hexdigest()[:-2] == v1.objects[-1])

    ins_hash_v2, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v2.objects[-1])
    del_hash_v2 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",2,two,2.2)'.encode(
            "ascii")).hexdigest())
    assert del_hash_v2.hex(
    ) == "88e01be43523057d192b2fd65e69f651a9515b7e30d17a9fb852926b71e3bdff"
    assert ins_hash_v2.hex() == om.get_object_meta(
        v2.objects)[v2.objects[-1]].insertion_hash
    assert ("o" + sha256(
        ((ins_hash_v2 - del_hash_v2).hex() +
         schema_hash).encode("ascii")).hexdigest()[:-2] == v2.objects[-1])
    assert rows_inserted == 1

    v2_meta = om.get_object_meta(v2.objects)[v2.objects[-1]]
    assert ins_hash_v2.hex() == v2_meta.insertion_hash
    assert del_hash_v2.hex() == v2_meta.deletion_hash

    ins_hash_v3, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v3.objects[-1])
    del_hash_v3 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",42,UPDATED,2.2)'.encode(
            "ascii")).hexdigest())
    assert ("o" + sha256(
        ((ins_hash_v3 - del_hash_v3).hex() +
         schema_hash).encode("ascii")).hexdigest()[:-2] == v3.objects[-1])

    v3_meta = om.get_object_meta(v3.objects)[v3.objects[-1]]
    assert ins_hash_v3.hex() == v3_meta.insertion_hash
    assert del_hash_v3.hex() == v3_meta.deletion_hash
    assert rows_inserted == 1

    assert (ins_hash_base + ins_hash_v1 + ins_hash_v2 + ins_hash_v3 -
            del_hash_v1 - del_hash_v2 - del_hash_v3).hex() == final_hash
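The arithmetic in the final assertion works because the content hash is homomorphic: each row hashes to a 256-bit digest, digests of inserted rows are added and digests of deleted rows subtracted, so the hash of the latest image can be assembled from the per-fragment insertion/deletion hashes. A minimal sketch of that idea, assuming digests combine as integers modulo 2**256 (the real arithmetic lives in the Digest class):

import hashlib

MOD = 2 ** 256

def row_digest(row_text: str) -> int:
    # Hash one row's text representation and treat the digest as a big integer.
    return int(hashlib.sha256(row_text.encode("ascii")).hexdigest(), 16)

inserted = [row_digest(r) for r in ["(1,one)", "(2,two)", "(3,three)"]]
deleted = [row_digest(r) for r in ["(3,three)"]]

# Inserting three rows and then deleting one gives the same combined digest
# as inserting the two surviving rows directly: order and grouping don't matter.
combined = (sum(inserted) - sum(deleted)) % MOD
direct = sum(row_digest(r) for r in ["(1,one)", "(2,two)"]) % MOD
assert combined == direct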
Example no. 18
def test_bloom_index_querying(local_engine_empty):
    # Same dataset as the previous, but this time test querying the bloom index
    # by calling it directly (not as part of an LQ).

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(26):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))

    # Make 3 chunks (value_1 = a-i, j-r, s-z, value_2 = 0-16, 18-34, 36-50).
    # These technically will get caught by the range index but we're not touching
    # it here.
    head = OUTPUT.commit(
        chunk_size=9,
        extra_indexes={
            "test": {
                "bloom": {
                    "value_1": {
                        "probability": 0.01
                    },
                    "value_2": {
                        "probability": 0.01
                    }
                }
            }
        },
    )

    objects = head.get_table("test").objects
    assert len(objects) == 3

    def test_filter(quals, result):
        assert filter_bloom_index(OUTPUT.engine, objects, quals) == result

    # Basic test: check we get only one object matching values from each chunk.
    test_filter([[("value_1", "=", "a")]], [objects[0]])
    test_filter([[("value_1", "=", "k")]], [objects[1]])
    test_filter([[("value_1", "=", "u")]], [objects[2]])

    test_filter([[("value_2", "=", "10")]], [objects[0]])
    test_filter([[("value_2", "=", "20")]], [objects[1]])
    test_filter([[("value_2", "=", "40")]], [objects[2]])

    # This is fun: 37 isn't in the original table (it's only even numbers)
    # but gets caught here as a false positive.
    test_filter([[("value_2", "=", "37")]], [objects[2]])
    # 39 doesn't though.
    test_filter([[("value_2", "=", "39")]], [])

    # Check ORs on same column -- returns two fragments
    test_filter([[("value_1", "=", "b"), ("value_1", "=", "l")]],
                [objects[0], objects[1]])

    # Check ORs on different columns
    test_filter([[("value_1", "=", "b"), ("value_2", "=", "38")]],
                [objects[0], objects[2]])

    test_filter([[("value_1", "=", "b"), ("value_2", "=", "39")]],
                [objects[0]])

    # Check AND
    test_filter([[("value_1", "=", "c")], [("value_2", "=", "40")]], [])

    test_filter([[("value_1", "=", "x")], [("value_2", "=", "40")]],
                [objects[2]])

    # Check AND with an unsupported operator -- gets discarded
    test_filter([[("value_1", "=", "x")], [("value_2", ">", "32")]],
                [objects[2]])

    # OR with unsupported: unsupported evaluates to True, so we have to fetch all objects.
    test_filter([[("value_1", "=", "not_here"), ("value_2", ">", "32")]],
                objects)

    # Test a composite expression: ((False OR True) AND (True OR (unsupported -> True))).
    # The first OR-block is only true for objects[1]; the second block is true for all objects,
    # but the two are intersected, so the result is [objects[1]].
    test_filter(
        [
            [("value_1", "=", "not here"), ("value_2", "=", "32")],
            [("value_1", "=", "k"), ("value_2", ">", "100")],
        ],
        [objects[1]],
    )
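The quals argument above is in conjunctive normal form: the outer list is ANDed, each inner list is ORed, and conditions the index can't evaluate count as "possibly true", which is why an unsupported operator widens the result. A small sketch of that evaluation rule against a per-object membership check (the may_contain callback is a stand-in for a bloom filter lookup, not the library's API):

from typing import Callable, Sequence, Tuple

Qual = Tuple[str, str, object]  # (column, operator, value)

def object_may_match(quals: Sequence[Sequence[Qual]],
                     may_contain: Callable[[str, object], bool]) -> bool:
    # Outer list: AND. Inner lists: OR. An object is kept only if every
    # OR-clause has at least one condition that might be true for it.
    for or_clause in quals:
        clause_possible = False
        for column, operator, value in or_clause:
            if operator != "=":
                # Unsupported operator: the index can't rule it out,
                # so treat it as possibly true.
                clause_possible = True
                break
            if may_contain(column, value):
                clause_possible = True
                break
        if not clause_possible:
            return False
    return True

# Toy membership test standing in for a bloom filter on one object.
contents = {"value_1": {"j", "k", "l"}, "value_2": {18, 20, 22}}
may_contain = lambda col, val: val in contents[col]

assert object_may_match([[("value_1", "=", "k")]], may_contain)
assert not object_may_match([[("value_1", "=", "a")]], may_contain)
# AND of two clauses: both must be possible.
assert not object_may_match([[("value_1", "=", "k")], [("value_2", "=", 40)]], may_contain)
# An unsupported operator inside an OR-clause makes the whole clause possible.
assert object_may_match([[("value_1", "=", "a"), ("value_2", ">", 100)]], may_contain)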
Example no. 19
def _prepare_object_filtering_dataset(include_bloom=False):
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test
            (col1 int primary key,
             col2 int,
             col3 varchar,
             col4 timestamp,
             col5 json)""")

    bloom_params = ({
        "test": {
            "bloom": {
                col: {
                    "probability": 0.001
                }
                for col in ["col2", "col3", "col4", "col5"]
            }
        }
    } if include_bloom else None)

    # First object is kind of normal: incrementing PK, a random col2, some text, some timestamps
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (1, 5, 'aaaa', '2016-01-01 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (5, 3, 'bbbb', '2016-01-02 00:00:00', '{\"a\": 10}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    # Grab the newly created object (at the end of the table's objects list)
    obj_1 = OUTPUT.head.get_table("test").objects[-1]
    # Sanity check on index for reference + easier debugging
    assert OUTPUT.objects.get_object_meta(
        [obj_1])[obj_1].object_index["range"] == {
            "col1": [1, 5],
            "col2": [3, 5],
            "col3": ["aaaa", "bbbb"],
            "col4": ["2016-01-01 00:00:00", "2016-01-02 00:00:00"],
        }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta(
            [obj_1])[obj_1].object_index["bloom"] == {
                "col2": [12, mock.ANY],
                "col3": [12, mock.ANY],
                "col4": [12, mock.ANY],
                "col5": [12, mock.ANY],
            }

    # Second object: PK increments, ranges for col2 and col3 overlap, col4 has the same timestamps everywhere
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (6, 1, 'abbb', '2015-12-30 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (10, 4, 'cccc', '2015-12-30 00:00:00', '{\"a\": 10}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    obj_2 = OUTPUT.head.get_table("test").objects[-1]
    assert OUTPUT.objects.get_object_meta(
        [obj_2])[obj_2].object_index["range"] == {
            "col1": [6, 10],
            "col2": [1, 4],
            "col3": ["abbb", "cccc"],
            "col4": ["2015-12-30 00:00:00", "2015-12-30 00:00:00"],
        }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta(
            [obj_2])[obj_2].object_index["bloom"] == {
                "col2": [12, mock.ANY],
                "col3": [12, mock.ANY],
                "col4": [12, mock.ANY],
                "col5": [12, mock.ANY],
            }

    # Third object: just a single row
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (11, 10, 'dddd', '2016-01-05 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    obj_3 = OUTPUT.head.get_table("test").objects[-1]
    assert OUTPUT.objects.get_object_meta(
        [obj_3])[obj_3].object_index["range"] == {
            "col1": [11, 11],
            "col2": [10, 10],
            "col3": ["dddd", "dddd"],
            "col4": ["2016-01-05 00:00:00", "2016-01-05 00:00:00"],
        }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta(
            [obj_3])[obj_3].object_index["bloom"] == {
                "col2": [12, mock.ANY],
                "col3": [12, mock.ANY],
                "col4": [12, mock.ANY],
                "col5": [12, mock.ANY],
            }

    # Fourth object: PK increments, ranges for col2/col3 don't overlap, col4 spans obj_1's range, we have a NULL.
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (12, 11, 'eeee', '2015-12-31 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (14, 13, 'ezzz', NULL, '{\"a\": 5}')")
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (16, 15, 'ffff', '2016-01-04 00:00:00', '{\"a\": 10}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    obj_4 = OUTPUT.head.get_table("test").objects[-1]
    assert OUTPUT.objects.get_object_meta(
        [obj_4])[obj_4].object_index["range"] == {
            "col1": [12, 16],
            "col2": [11, 15],
            "col3": ["eeee", "ffff"],
            "col4": ["2015-12-31 00:00:00", "2016-01-04 00:00:00"],
        }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta(
            [obj_4])[obj_4].object_index["bloom"] == {
                "col2": [12, mock.ANY],
                "col3": [12, mock.ANY],
                "col4": [12, mock.ANY],
                "col5": [12, mock.ANY],
            }

    return [obj_1, obj_2, obj_3, obj_4]
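For reference, the "range" entries asserted above store per-column [min, max] bounds, and an object can be skipped whenever a qual's value falls outside them. A minimal sketch of that pruning rule (illustrative only, not the library's implementation):

def may_match_range(object_index: dict, column: str, op: str, value) -> bool:
    # No range information for the column: we cannot rule the object out.
    if column not in object_index.get("range", {}):
        return True
    minimum, maximum = object_index["range"][column]
    if op == "=":
        return minimum <= value <= maximum
    if op == ">":
        return maximum > value
    if op == "<":
        return minimum < value
    return True  # Unsupported operator: assume the object may match.

obj_1_index = {"range": {"col1": [1, 5], "col2": [3, 5]}}
assert may_match_range(obj_1_index, "col1", "=", 3)
assert not may_match_range(obj_1_index, "col1", "=", 7)
assert not may_match_range(obj_1_index, "col2", ">", 5)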
Example no. 20
def test_bloom_index_structure(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    # Insert 26 rows with value_1 spanning a-z
    for i in range(26):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    head = OUTPUT.commit(
        extra_indexes={"test": {
            "bloom": {
                "value_1": {
                    "size": 16
                }
            }
        }})

    objects = head.get_table("test").objects
    object_meta = OUTPUT.objects.get_object_meta(objects)

    index = object_meta[objects[0]].object_index
    assert "bloom" in index

    # The bloom index used k=4: the formula for the optimal k is
    # m * ln(2) / n = 16 * 8 (filter size in bits) * 0.693 / 26 (distinct items)
    # = 3.41, which rounds up to 4.

    # The actual base64 value here is a canary: if the index fingerprint for some reason changes
    # with the same data, this is highly suspicious.
    assert index["bloom"] == {"value_1": [4, "T79jcHurra5T6d8Hk+djZA=="]}

    # Test indexing two columns with different parameters
    # Delete one row so that the object doesn't get reused and the index gets written again.
    OUTPUT.run_sql("DELETE FROM test WHERE key = 26")
    head = OUTPUT.commit(
        snap_only=True,
        extra_indexes={
            "test": {
                "bloom": {
                    "value_1": {
                        "size": 16
                    },
                    "value_2": {
                        "probability": 0.01
                    }
                }
            }
        },
    )

    objects = head.get_table("test").objects
    object_meta = OUTPUT.objects.get_object_meta(objects)

    index = object_meta[objects[0]].object_index
    assert "bloom" in index

    # For a fixed probability, we have k = -log2(p) = 6.64, rounded up to 7,
    # and filter size = -n * ln(p) / ln(2)**2 ~= 239.6 bits for the 25 remaining rows,
    # rounded up to 30 bytes, which base64 encodes to 30 / 3 * 4 = 40 characters.
    assert index["bloom"] == {
        "value_1": [4, "D79jcGurra5T6d8Hk+djZA=="],
        "value_2": [7, "uSP6qzHHDqVq/qHMlqrAoHhpxEuZ08McrB0J6c9M"],
    }
    assert len(index["bloom"]["value_2"][1]) == 40