def test_bloom_index_datetime(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql("CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)")
    for i in range(50):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s)", (i + 1, dt(2015, 1, 1) + timedelta(days=i))
        )

    head = OUTPUT.commit(
        extra_indexes={"test": {"bloom": {"value_1": {"probability": 0.01}}}}
    )
    objects = head.get_table("test").objects

    # Datetimes are supported in the bloom index if they're passed in as actual datetime
    # objects (which Multicorn does).

    # Spot check some dates that exist...
    for i in range(0, 50, 5):
        assert (
            filter_bloom_index(
                OUTPUT.engine,
                objects,
                [[("value_1", "=", dt(2015, 1, 1) + timedelta(days=i))]],
            )
            == objects
        )

    # ...and some that don't.
    assert (
        filter_bloom_index(
            OUTPUT.engine, objects, [[("value_1", "=", dt(2015, 1, 1) + timedelta(days=55))]]
        )
        == []
    )

    # They also work if passed in as ISO strings with space as a separator.
    assert (
        filter_bloom_index(OUTPUT.engine, objects, [[("value_1", "=", "2015-01-01 00:00:00")]])
        == objects
    )


def test_bloom_index_post_factum(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql("CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)")
    for i in range(50):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s)", (i + 1, dt(2015, 1, 1) + timedelta(days=i))
        )
    head = OUTPUT.commit()

    # Reindex the table after committing it and check that object metadata gets overwritten.
    head.get_table("test").reindex(
        extra_indexes={"bloom": {"value_1": {"probability": 0.01}}}
    )

    obj = head.get_table("test").objects[0]
    object_index = OUTPUT.objects.get_object_meta([obj])[obj].object_index
    assert "bloom" in object_index

    # Run a snippet from the previous test to check the bloom metadata is valid.
    objects = head.get_table("test").objects
    for i in range(0, 50, 5):
        assert (
            filter_bloom_index(
                OUTPUT.engine,
                objects,
                [[("value_1", "=", dt(2015, 1, 1) + timedelta(days=i))]],
            )
            == objects
        )


def _setup_dataset():
    OUTPUT.init()
    OUTPUT.run_sql(
        """CREATE TABLE test (id integer, name varchar);
        INSERT INTO test VALUES (1, 'test')"""
    )
    OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    return OUTPUT.commit()


def test_base_fragment_reused_chunking(local_engine_empty):
    # Check that if we split a table into chunks and some chunks are the same,
    # they get assigned to the same objects.
    OUTPUT.init()
    base = OUTPUT.head
    _make_test_table(OUTPUT)
    OUTPUT.commit(chunk_size=5)
    table_1 = OUTPUT.head.get_table("test")
    # Table 1 produced 3 objects
    assert len(OUTPUT.objects.get_all_objects()) == 3

    # All chunks are the same
    base.checkout()
    _make_test_table(OUTPUT)
    OUTPUT.commit(chunk_size=5)
    table_2 = OUTPUT.head.get_table("test")
    assert len(OUTPUT.objects.get_all_objects()) == 3
    assert table_1.objects == table_2.objects

    # Insert something else into the middle chunk so that it's different. This will get
    # conflated so won't get recorded as an update.
    base.checkout()
    _make_test_table(OUTPUT)
    OUTPUT.run_sql("UPDATE test SET value_1 = 'UPDATED', value_2 = 42 WHERE key = 7")
    OUTPUT.commit(chunk_size=5)
    table_3 = OUTPUT.head.get_table("test")
    assert len(OUTPUT.objects.get_all_objects()) == 4
    # Table 3 reused the first and the last object but created a new one for the middle fragment.
    assert len(table_3.objects) == 3
    assert table_3.objects[0] == table_1.objects[0]
    assert table_3.objects[1] != table_1.objects[1]
    assert table_3.objects[2] == table_1.objects[2]


def test_layered_querying_json_arrays(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, "
        "value JSONB, arr_value INTEGER[], arr_2d_value TEXT[][])"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (1, %s, %s, %s)",
        (json.dumps({"a": 1, "b": 2.5}), [1, 2, 3], [["one", "two"], ["three", "four"]]),
    )
    OUTPUT.commit()
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (2, %s, %s, %s)",
        (
            json.dumps({"a": "one", "b": "two point five"}),
            [4, 5, 6],
            [["five", "six"], ["seven", "eight"]],
        ),
    )
    head = OUTPUT.commit()
    OUTPUT.uncheckout()
    head.checkout(layered=True)

    assert OUTPUT.run_sql("SELECT * FROM test ORDER BY key") == [
        (1, {"a": 1, "b": 2.5}, [1, 2, 3], [["one", "two"], ["three", "four"]]),
        (2, {"a": "one", "b": "two point five"}, [4, 5, 6], [["five", "six"], ["seven", "eight"]]),
    ]


def test_import_and_update(local_engine_empty, unprivileged_pg_repo):
    OUTPUT.init()
    head = OUTPUT.head
    remote_head = unprivileged_pg_repo.images["latest"]
    # Import the 'fruits' table from the origin.
    import_table_from_remote(
        unprivileged_pg_repo, ["fruits"], remote_head.image_hash, OUTPUT, target_tables=[]
    )
    new_head = OUTPUT.head

    OUTPUT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    new_head_2 = OUTPUT.commit()

    head.checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"), (2, "orange")]

    new_head_2.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]


def test_import_bare(pg_repo_local):
    # Check import without checking anything out, just by manipulating metadata
    # and running LQs against source images.

    # Create a new schema and import 'fruits'
    OUTPUT.init()
    # Make sure the existing table is preserved.
    OUTPUT.run_sql("CREATE TABLE sentinel (key INTEGER)")
    OUTPUT.commit()
    pg_repo_local.uncheckout()
    OUTPUT.uncheckout()

    OUTPUT.import_tables(
        tables=["imported_fruits"],
        source_repository=pg_repo_local,
        image_hash=pg_repo_local.images["latest"].image_hash,
        source_tables=["SELECT * FROM fruits WHERE fruit_id = 1"],
        parent_hash=OUTPUT.images["latest"].image_hash,
        do_checkout=False,
        table_queries=[True],
    )
    assert OUTPUT.head is None
    assert pg_repo_local.head is None

    assert sorted(OUTPUT.images["latest"].get_tables()) == ["imported_fruits", "sentinel"]
    assert list(
        OUTPUT.images["latest"].get_table("imported_fruits").query(columns=["name"], quals=[])
    ) == [{"name": "apple"}]


def test_commandline_show_empty_image(local_engine_empty):
    # Check size calculations etc. in an empty image don't cause errors.
    runner = CliRunner()
    OUTPUT.init()
    assert OUTPUT.images["latest"].get_size() == 0
    result = runner.invoke(show_c, [str(OUTPUT) + ":" + "000000000000"], catch_exceptions=False)
    assert "Size: 0.00 B" in result.output


def test_bloom_index_deletions(local_engine_empty):
    # Check the bloom index fingerprint includes both the old and the new values
    # of deleted/added cells.
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    # Insert 26 rows with value_1 spanning a-z
    for i in range(26):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)
        )
    OUTPUT.commit()

    # Delete and update some rows
    OUTPUT.run_sql("DELETE FROM test WHERE key = 5")  # ('e', 8)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 10")  # ('j', 18)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 15")  # ('o', 28)
    OUTPUT.run_sql("UPDATE test SET value_1 = 'G' WHERE key = 7")  # (g -> G)
    OUTPUT.run_sql("UPDATE test SET value_2 = 23 WHERE key = 12")  # (22 -> 23)

    head = OUTPUT.commit(
        extra_indexes={
            "test": {
                "bloom": {"value_1": {"probability": 0.01}, "value_2": {"probability": 0.01}}
            }
        }
    )
    objects = head.get_table("test").objects

    # Sanity check: 2 objects (original data + new with 3 deletions and 2 upserts)
    assert len(objects) == 2
    assert (
        local_engine_empty.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta." + objects[1],
            return_shape=ResultShape.ONE_ONE,
        )
        == 5
    )

    # Check old/new values for value_1: 3 old values before a deletion,
    # 1 old value before update, 1 updated value
    value_1_vals = ["e", "j", "o", "g", "G"]
    # value_2, same
    value_2_vals = [8, 18, 28, 22, 23]

    for val in value_1_vals:
        assert filter_bloom_index(OUTPUT.engine, objects, [[("value_1", "=", val)]]) == objects

    for val in value_2_vals:
        assert filter_bloom_index(OUTPUT.engine, objects, [[("value_2", "=", val)]]) == objects


def test_rollback_on_error(local_engine_empty):
    # For e.g. commit/checkout/other commands, we don't do commits/rollbacks
    # in the library itself and expect the caller to manage transactions. In the CLI,
    # we need to make sure that erroneous transactions (e.g. interrupted SG commits)
    # are rolled back correctly instead of being committed.
    runner = CliRunner()

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(11):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)
        )
    head = OUTPUT.commit(chunk_size=5, in_fragment_order={"test": ["key", "value_1"]})
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    _alter_diff_splitting_dataset()
    OUTPUT.commit_engines()

    # Simulate the commit getting interrupted by the first object going through and being
    # recorded, then an error (standing in for e.g. a KeyboardInterrupt) being raised.
    called_once = False

    def interrupted_register(*args, **kwargs):
        nonlocal called_once
        if called_once:
            raise BaseException("something went wrong")
        else:
            called_once = True
            return FragmentManager._register_object(*args, **kwargs)

    with patch(
        "splitgraph.core.fragment_manager.FragmentManager._register_object",
        side_effect=interrupted_register,
    ) as ro:
        with pytest.raises(BaseException):
            runner.invoke(cli, ["commit", OUTPUT.to_schema()])

    # Check that no image/object metadata was written
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3
    assert ro.call_count == 2

    # Check that the data in the audit trigger wasn't deleted
    assert len(OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(), table="test")) == 6


def test_import_multiple_tables(pg_repo_local):
    OUTPUT.init()
    head = OUTPUT.head
    OUTPUT.import_tables(tables=[], source_repository=pg_repo_local, source_tables=[])

    for table_name in ["fruits", "vegetables"]:
        assert OUTPUT.run_sql("SELECT * FROM %s" % table_name) == pg_repo_local.run_sql(
            "SELECT * FROM %s" % table_name
        )

    new_head = OUTPUT.head
    assert new_head != head
    assert new_head.parent_id == head.image_hash


def test_import_preserves_pending_changes(pg_repo_local):
    OUTPUT.init()
    OUTPUT.run_sql(
        """CREATE TABLE test (id integer, name varchar);
        INSERT INTO test VALUES (1, 'test')"""
    )
    head = OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    changes = get_engine().get_pending_changes(OUTPUT.to_schema(), "test")

    OUTPUT.import_tables(
        tables=["imported_fruits"], source_repository=pg_repo_local, source_tables=["fruits"]
    )

    assert OUTPUT.head.parent_id == head.image_hash
    assert changes == OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(), "test")


def test_import_basic(pg_repo_local):
    # Create a new schema and import 'fruits' from the mounted PG table.
    OUTPUT.init()
    head = OUTPUT.head

    OUTPUT.import_tables(
        tables=["imported_fruits"], source_repository=pg_repo_local, source_tables=["fruits"]
    )

    assert OUTPUT.run_sql("SELECT * FROM imported_fruits") == pg_repo_local.run_sql(
        "SELECT * FROM fruits"
    )
    new_head = OUTPUT.head
    assert new_head != head
    assert new_head.parent_id == head.image_hash


def test_import_query_reuses_hash(pg_repo_local):
    OUTPUT.init()
    base = OUTPUT.head
    # Run two imports: one importing all rows from `fruits` (will reuse the original
    # `fruits` object), one importing just the first row (new hash, won't be reused).
    ih_v1 = OUTPUT.import_tables(
        source_repository=pg_repo_local,
        source_tables=["SELECT * FROM fruits", "SELECT * FROM fruits WHERE fruit_id = 1"],
        tables=["fruits_all", "fruits_one"],
        do_checkout=False,
        table_queries=[True, True],
    )
    v1 = OUTPUT.images.by_hash(ih_v1)
    assert v1.get_table("fruits_all").objects == pg_repo_local.head.get_table("fruits").objects
    # Original fruits and vegetables + the 1-row import
    assert len(OUTPUT.objects.get_all_objects()) == 3

    # Run the same set of imports again: this time both query results already exist
    # and will be reused.
    base.checkout()
    ih_v2 = OUTPUT.import_tables(
        source_repository=pg_repo_local,
        source_tables=["SELECT * FROM fruits", "SELECT * FROM fruits WHERE fruit_id = 1"],
        tables=["fruits_all", "fruits_one"],
        do_checkout=False,
        table_queries=[True, True],
    )
    v2 = OUTPUT.images.by_hash(ih_v2)
    assert v2.get_table("fruits_all").objects == v1.get_table("fruits_all").objects
    assert v2.get_table("fruits_one").objects == v1.get_table("fruits_one").objects
    # No new objects have been created.
    assert len(OUTPUT.objects.get_all_objects()) == 3


def test_range_index_ordering_collation(local_engine_empty):
    # Test that the range index gets min/max values of text columns using the "C" collation
    # (sort by byte values of characters) rather than anything else (e.g. in en-US,
    # "a" comes before "B" even though "B" has a smaller ASCII code).
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key_1 INTEGER, key_2 VARCHAR,"
        " value_1 VARCHAR, value_2 INTEGER, PRIMARY KEY (key_1, key_2))"
    )

    OUTPUT.engine.run_sql_batch(
        "INSERT INTO test VALUES (%s, %s, %s, %s)",
        [
            (1, "ONE", "apple", 4),
            (1, "one", "ORANGE", 3),
            (2, "two", "banana", 2),
            (2, "TWO", "CUCUMBER", 1),
        ],
        schema=OUTPUT.to_schema(),
    )

    head = OUTPUT.commit()
    object_id = head.get_table("test").objects[0]
    assert OUTPUT.objects.get_object_meta([object_id])[object_id].object_index == {
        "range": {
            "$pk": [[1, "ONE"], [2, "two"]],
            "key_1": [1, 2],
            "key_2": ["ONE", "two"],
            "value_1": ["CUCUMBER", "banana"],
            "value_2": [1, 4],
        }
    }


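# For reference, a worked version of the ordering argument in the test above: Python's default
# string comparison is by code point, which for ASCII matches the "C" collation, so
# sorted(["apple", "ORANGE", "banana", "CUCUMBER"]) == ["CUCUMBER", "ORANGE", "apple", "banana"],
# giving the asserted ["CUCUMBER", "banana"] min/max range for value_1 (and ["ONE", "two"] for key_2).

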
def test_bloom_reindex_changed_table(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(26):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)
        )
    OUTPUT.commit(chunk_size=13)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 5")  # ('e', 8)
    head = OUTPUT.commit()

    objects = head.get_table("test").objects
    assert len(objects) == 3  # original 2 fragments and one overwrite

    index_spec = {
        "bloom": {"value_1": {"probability": 0.01}, "value_2": {"probability": 0.01}}
    }
    # Since the patch object deletes the old value, we don't immediately know what it was
    # and so can't reindex that object.
    with pytest.raises(ObjectIndexingError) as e:
        head.get_table("test").reindex(extra_indexes=index_spec)
    assert "1 object" in str(e.value)

    reindexed = head.get_table("test").reindex(
        extra_indexes=index_spec, raise_on_patch_objects=False
    )
    assert objects[0] in reindexed
    assert objects[1] in reindexed
    assert objects[2] not in reindexed


def test_diff_fragment_hashing_long_chain(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key TIMESTAMP PRIMARY KEY, val1 INTEGER, val2 VARCHAR, val3 NUMERIC)"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES ('2019-01-01 01:01:01.111', 1, 'one', 1.1),"
        "('2019-01-02 02:02:02.222', 2, 'two', 2.2),"
        "('2019-01-03 03:03:03.333', 3, 'three', 3.3),"
        "('2019-01-04 04:04:04.444', 4, 'four', 4.4)"
    )
    OUTPUT.commit()
    base = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "DELETE FROM test WHERE key = '2019-01-03 03:03:03.333';"
        "INSERT INTO test VALUES ('2019-01-05', 5, 'five', 5.5)"
    )
    OUTPUT.commit()
    v1 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED', val1 = 42 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v2 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED AGAIN', val1 = 43 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v3 = OUTPUT.head.get_table("test")

    om = OUTPUT.objects
    final_hash, rows_inserted = OUTPUT.objects.calculate_content_hash(OUTPUT.to_schema(), "test")
    assert rows_inserted == 4

    schema_hash = om._calculate_schema_hash(base.table_schema)

    # Check that the final hash can be assembled out of intermediate objects'
    # insertion and deletion hashes.
    ins_hash_base, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, base.objects[0]
    )
    assert (
        "o" + sha256((ins_hash_base.hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == base.objects[-1]
    )
    assert rows_inserted == 4

    ins_hash_v1, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v1.objects[-1]
    )
    assert rows_inserted == 1

    # A timestamp cast to text in a tuple is wrapped with double quotes in PG.
    # As long as hashing is consistent (this happens with all engines no matter what their
    # conventions are), we don't really mind, but this might cause some weird issues later
    # with verifying hashes/deduplication.
    del_hash_v1 = Digest.from_hex(
        sha256('("2019-01-03 03:03:03.333",3,three,3.3)'.encode("ascii")).hexdigest()
    )
    assert del_hash_v1.hex() == "b12a93d54ba7ff1c2e26c92f01ac9c9d7716242eb47344d57c89b481227f5298"

    # Check that the object metadata contains the same hashes.
    v1_meta = om.get_object_meta(v1.objects)[v1.objects[-1]]
    assert ins_hash_v1.hex() == v1_meta.insertion_hash
    assert del_hash_v1.hex() == v1_meta.deletion_hash

    assert (
        "o"
        + sha256(((ins_hash_v1 - del_hash_v1).hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == v1.objects[-1]
    )

    ins_hash_v2, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v2.objects[-1]
    )
    del_hash_v2 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",2,two,2.2)'.encode("ascii")).hexdigest()
    )
    assert del_hash_v2.hex() == "88e01be43523057d192b2fd65e69f651a9515b7e30d17a9fb852926b71e3bdff"
    assert ins_hash_v2.hex() == om.get_object_meta(v2.objects)[v2.objects[-1]].insertion_hash
    assert (
        "o"
        + sha256(((ins_hash_v2 - del_hash_v2).hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == v2.objects[-1]
    )
    assert rows_inserted == 1

    v2_meta = om.get_object_meta(v2.objects)[v2.objects[-1]]
    assert ins_hash_v2.hex() == v2_meta.insertion_hash
    assert del_hash_v2.hex() == v2_meta.deletion_hash

    ins_hash_v3, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v3.objects[-1]
    )
    del_hash_v3 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",42,UPDATED,2.2)'.encode("ascii")).hexdigest()
    )
    assert (
        "o"
        + sha256(((ins_hash_v3 - del_hash_v3).hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == v3.objects[-1]
    )

    v3_meta = om.get_object_meta(v3.objects)[v3.objects[-1]]
    assert ins_hash_v3.hex() == v3_meta.insertion_hash
    assert del_hash_v3.hex() == v3_meta.deletion_hash
    assert rows_inserted == 1

    assert (
        ins_hash_base + ins_hash_v1 + ins_hash_v2 + ins_hash_v3
        - del_hash_v1 - del_hash_v2 - del_hash_v3
    ).hex() == final_hash


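# The chained asserts above rely on the content hash being additive: each row hashes to a
# 256-bit digest, and a fragment's insertion/deletion hashes combine row digests so that a
# table's final hash can be rebuilt by adding insertion hashes and subtracting deletion hashes
# along the fragment chain. Below is a minimal sketch of that property using plain integers
# modulo 2**256 (illustrative only -- it is not splitgraph's Digest class, and the row
# serialization is a stand-in for the real tuple-as-text form used in the test above).
def _digest_sketch(row_text):
    # One 256-bit digest per serialized row (reuses the module's sha256 import).
    return int(sha256(row_text.encode("ascii")).hexdigest(), 16)


def _table_hash_sketch(rows):
    # A table hash that is just the modular sum of its rows' digests.
    return sum(_digest_sketch(r) for r in rows) % 2 ** 256


# With this construction, deleting a row subtracts exactly what inserting it added:
# _table_hash_sketch(["a", "b"]) ==
#     (_table_hash_sketch(["a", "b", "c"]) - _digest_sketch("c")) % 2 ** 256

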
def test_bloom_index_querying(local_engine_empty):
    # Same dataset as the previous test, but this time test querying the bloom index
    # by calling it directly (not as part of an LQ).
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(26):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)
        )

    # Make 3 chunks (value_1 = a-i, j-r, s-z, value_2 = 0-16, 18-34, 36-50).
    # These technically will get caught by the range index but we're not touching it here.
    head = OUTPUT.commit(
        chunk_size=9,
        extra_indexes={
            "test": {
                "bloom": {"value_1": {"probability": 0.01}, "value_2": {"probability": 0.01}}
            }
        },
    )

    objects = head.get_table("test").objects
    assert len(objects) == 3

    def test_filter(quals, result):
        assert filter_bloom_index(OUTPUT.engine, objects, quals) == result

    # Basic test: check we get only one object matching values from each chunk.
    test_filter([[("value_1", "=", "a")]], [objects[0]])
    test_filter([[("value_1", "=", "k")]], [objects[1]])
    test_filter([[("value_1", "=", "u")]], [objects[2]])

    test_filter([[("value_2", "=", "10")]], [objects[0]])
    test_filter([[("value_2", "=", "20")]], [objects[1]])
    test_filter([[("value_2", "=", "40")]], [objects[2]])

    # This is fun: 37 isn't in the original table (it's only even numbers)
    # but gets caught here as a false positive.
    test_filter([[("value_2", "=", "37")]], [objects[2]])
    # 39 doesn't though.
    test_filter([[("value_2", "=", "39")]], [])

    # Check ORs on the same column -- returns two fragments.
    test_filter([[("value_1", "=", "b"), ("value_1", "=", "l")]], [objects[0], objects[1]])

    # Check ORs on different columns.
    test_filter([[("value_1", "=", "b"), ("value_2", "=", "38")]], [objects[0], objects[2]])
    test_filter([[("value_1", "=", "b"), ("value_2", "=", "39")]], [objects[0]])

    # Check AND
    test_filter([[("value_1", "=", "c")], [("value_2", "=", "40")]], [])
    test_filter([[("value_1", "=", "x")], [("value_2", "=", "40")]], [objects[2]])

    # Check AND with an unsupported operator -- gets discarded.
    test_filter([[("value_1", "=", "x")], [("value_2", ">", "32")]], [objects[2]])

    # OR with unsupported: the unsupported qual evaluates to True, so we have to fetch all objects.
    test_filter([[("value_1", "=", "not_here"), ("value_2", ">", "32")]], objects)

    # Test a composite operator: ((False OR True) AND (True OR (unsupported -> True))).
    # The first OR-block is only true for objects[1], the second block is true for all objects,
    # but it gets intersected, so the result is objects[1].
    test_filter(
        [
            [("value_1", "=", "not here"), ("value_2", "=", "32")],
            [("value_1", "=", "k"), ("value_2", ">", "100")],
        ],
        [objects[1]],
    )


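# For reference, the qual structure exercised in the test above is conjunctive normal form:
# the outer list is AND-ed together and each inner list is a group of OR-ed
# (column, operator, value) triples. An illustrative shape (values are placeholders):
#
#   example_quals = [
#       [("value_1", "=", "b"), ("value_1", "=", "l")],  # value_1 = 'b' OR value_1 = 'l'
#       [("value_2", "=", "40")],                        # AND value_2 = '40'
#   ]
#
# filter_bloom_index then returns the subset of candidate objects whose bloom fingerprints
# can't rule the whole expression out, which is why false positives (like 37 above) are possible
# but false negatives are not.

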
def _prepare_object_filtering_dataset(include_bloom=False):
    OUTPUT.init()
    OUTPUT.run_sql(
        """CREATE TABLE test
        (col1 int primary key,
         col2 int,
         col3 varchar,
         col4 timestamp,
         col5 json)"""
    )

    bloom_params = (
        {
            "test": {
                "bloom": {
                    col: {"probability": 0.001} for col in ["col2", "col3", "col4", "col5"]
                }
            }
        }
        if include_bloom
        else None
    )

    # First object is kind of normal: incrementing PK, a random col2, some text, some timestamps
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (1, 5, 'aaaa', '2016-01-01 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (5, 3, 'bbbb', '2016-01-02 00:00:00', '{\"a\": 10}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    # Grab the newly created object (at the end of the table's objects list)
    obj_1 = OUTPUT.head.get_table("test").objects[-1]

    # Sanity check on index for reference + easier debugging
    assert OUTPUT.objects.get_object_meta([obj_1])[obj_1].object_index["range"] == {
        "col1": [1, 5],
        "col2": [3, 5],
        "col3": ["aaaa", "bbbb"],
        "col4": ["2016-01-01 00:00:00", "2016-01-02 00:00:00"],
    }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta([obj_1])[obj_1].object_index["bloom"] == {
            "col2": [12, mock.ANY],
            "col3": [12, mock.ANY],
            "col4": [12, mock.ANY],
            "col5": [12, mock.ANY],
        }

    # Second object: PK increments, ranges for col2 and col3 overlap,
    # col4 has the same timestamps everywhere
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (6, 1, 'abbb', '2015-12-30 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (10, 4, 'cccc', '2015-12-30 00:00:00', '{\"a\": 10}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    obj_2 = OUTPUT.head.get_table("test").objects[-1]
    assert OUTPUT.objects.get_object_meta([obj_2])[obj_2].object_index["range"] == {
        "col1": [6, 10],
        "col2": [1, 4],
        "col3": ["abbb", "cccc"],
        "col4": ["2015-12-30 00:00:00", "2015-12-30 00:00:00"],
    }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta([obj_2])[obj_2].object_index["bloom"] == {
            "col2": [12, mock.ANY],
            "col3": [12, mock.ANY],
            "col4": [12, mock.ANY],
            "col5": [12, mock.ANY],
        }

    # Third object: just a single row
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (11, 10, 'dddd', '2016-01-05 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    obj_3 = OUTPUT.head.get_table("test").objects[-1]
    assert OUTPUT.objects.get_object_meta([obj_3])[obj_3].object_index["range"] == {
        "col1": [11, 11],
        "col2": [10, 10],
        "col3": ["dddd", "dddd"],
        "col4": ["2016-01-05 00:00:00", "2016-01-05 00:00:00"],
    }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta([obj_3])[obj_3].object_index["bloom"] == {
            "col2": [12, mock.ANY],
            "col3": [12, mock.ANY],
            "col4": [12, mock.ANY],
            "col5": [12, mock.ANY],
        }

    # Fourth object: PK increments, ranges for col2/col3 don't overlap,
    # col4 spans obj_1's range, we have a NULL.
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (12, 11, 'eeee', '2015-12-31 00:00:00', '{\"a\": 5}')"
    )
    OUTPUT.run_sql("INSERT INTO test VALUES (14, 13, 'ezzz', NULL, '{\"a\": 5}')")
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (16, 15, 'ffff', '2016-01-04 00:00:00', '{\"a\": 10}')"
    )
    OUTPUT.commit(extra_indexes=bloom_params)
    obj_4 = OUTPUT.head.get_table("test").objects[-1]
    assert OUTPUT.objects.get_object_meta([obj_4])[obj_4].object_index["range"] == {
        "col1": [12, 16],
        "col2": [11, 15],
        "col3": ["eeee", "ffff"],
        "col4": ["2015-12-31 00:00:00", "2016-01-04 00:00:00"],
    }
    if include_bloom:
        assert OUTPUT.objects.get_object_meta([obj_4])[obj_4].object_index["bloom"] == {
            "col2": [12, mock.ANY],
            "col3": [12, mock.ANY],
            "col4": [12, mock.ANY],
            "col5": [12, mock.ANY],
        }

    return [obj_1, obj_2, obj_3, obj_4]


def test_bloom_index_structure(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    # Insert 26 rows with value_1 spanning a-z
    for i in range(26):
        OUTPUT.run_sql(
            "INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)
        )

    head = OUTPUT.commit(extra_indexes={"test": {"bloom": {"value_1": {"size": 16}}}})

    objects = head.get_table("test").objects
    object_meta = OUTPUT.objects.get_object_meta(objects)
    index = object_meta[objects[0]].object_index
    assert "bloom" in index

    # The bloom index used k=4: the formula for the optimal k is
    # m * ln(2) / n = 16 * 8 (filter size in bits) * 0.693 / 26 (distinct items)
    # = 3.41, which rounds up to 4.
    # The actual base64 value here is a canary: if the index fingerprint for some reason changes
    # with the same data, this is highly suspicious.
    assert index["bloom"] == {"value_1": [4, "T79jcHurra5T6d8Hk+djZA=="]}

    # Test indexing two columns with different parameters.
    # Delete one row so that the object doesn't get reused and the index gets written again.
    OUTPUT.run_sql("DELETE FROM test WHERE key = 26")
    head = OUTPUT.commit(
        snap_only=True,
        extra_indexes={
            "test": {"bloom": {"value_1": {"size": 16}, "value_2": {"probability": 0.01}}}
        },
    )
    objects = head.get_table("test").objects
    object_meta = OUTPUT.objects.get_object_meta(objects)
    index = object_meta[objects[0]].object_index
    assert "bloom" in index

    # For a fixed probability, we have k = -log2(p) = 6.64 (rounded up to 7)
    # and filter size = -n * ln(p) / ln(2)**2 = 239.6 bits (n = 25 distinct items after the
    # deletion), rounded up to 30 bytes, which in base64 is represented with
    # ceil(30/3*4) = 40 characters.
    assert index["bloom"] == {
        "value_1": [4, "D79jcGurra5T6d8Hk+djZA=="],
        "value_2": [7, "uSP6qzHHDqVq/qHMlqrAoHhpxEuZ08McrB0J6c9M"],
    }
    assert len(index["bloom"]["value_2"][1]) == 40


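# Worked arithmetic for the parameter choices asserted above, using the standard bloom filter
# formulas that the test comments quote. This is an illustrative sketch only: the helper below
# is hypothetical and not part of the splitgraph API.
def _bloom_params_sketch(n_distinct, probability=None, size_bytes=None):
    """Return (k, size_bytes) for either a fixed filter size or a fixed false-positive rate."""
    import math

    if size_bytes is None:
        # Fixed probability: m = -n * ln(p) / ln(2)^2 bits, rounded up to whole bytes.
        # E.g. n=25, p=0.01 -> 239.6 bits -> 30 bytes (40 base64 characters).
        size_bytes = math.ceil(-n_distinct * math.log(probability) / math.log(2) ** 2 / 8)
    # Number of hash functions for the actual (byte-rounded) filter size: k = m * ln(2) / n,
    # rounded up. E.g. 16 bytes = 128 bits, n=26 -> 3.41 -> 4; 30 bytes = 240 bits, n=25 -> 6.65 -> 7.
    k = math.ceil(size_bytes * 8 * math.log(2) / n_distinct)
    return k, size_bytes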