def test_from_remote(local_engine_empty, pg_repo_remote_multitag):
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"), params={"TAG": "v1"}, output=OUTPUT)

    new_head = OUTPUT.head
    parent = OUTPUT.images.by_hash(new_head.parent_id)

    # Go back to the parent: the two source tables should exist there.
    parent.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the same splitfile but against v2 of the remote (where row 1 has been
    # removed from the fruits table). First, delete the output repository: otherwise
    # the executor tries to fetch the nonexistent commit 0000 from it.
    OUTPUT.delete()
    execute_commands(load_splitfile("from_remote.splitfile"), params={"TAG": "v2"}, output=OUTPUT)
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [(2, "orange", "carrot")]

def test_local_import_splitfile(pg_repo_local):
    execute_commands(load_splitfile("import_local.splitfile"), output=OUTPUT)
    head = OUTPUT.head
    old_head = head.parent_id

    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

def test_from_remote_hash(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head.image_hash

    # Test running commands that base new datasets on a remote repository,
    # this time pinned to an image hash prefix rather than a tag.
    execute_commands(
        load_splitfile("from_remote.splitfile"), params={"TAG": head[:10]}, output=OUTPUT
    )
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

def test_rollback_on_error(local_engine_empty):
    # For commands like commit/checkout, the library itself doesn't do
    # commits/rollbacks and expects the caller to manage transactions. In the CLI,
    # we need to make sure that erroneous transactions (e.g. interrupted SG commits)
    # are rolled back correctly instead of being committed.
    runner = CliRunner()

    OUTPUT.init()
    OUTPUT.run_sql("CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)")
    for i in range(11):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2))
    head = OUTPUT.commit(chunk_size=5, in_fragment_order={"test": ["key", "value_1"]})
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    _alter_diff_splitting_dataset()
    OUTPUT.commit_engines()

    # Simulate the commit getting interrupted: the first object goes through and is
    # recorded, then a KeyboardInterrupt-like BaseException is raised.
    called_once = False

    def interrupted_register(*args, **kwargs):
        nonlocal called_once
        if called_once:
            raise BaseException("something went wrong")
        called_once = True
        return FragmentManager._register_object(*args, **kwargs)

    with patch(
        "splitgraph.core.fragment_manager.FragmentManager._register_object",
        side_effect=interrupted_register,
    ) as ro:
        with pytest.raises(BaseException):
            runner.invoke(cli, ["commit", OUTPUT.to_schema()])

        # Check that no image/object metadata was written.
        assert len(OUTPUT.images()) == 2
        assert len(OUTPUT.objects.get_all_objects()) == 3
        assert ro.call_count == 2

    # Check that the data in the audit trigger wasn't deleted.
    assert len(OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(), table="test")) == 6

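# A minimal sketch (hypothetical names, not the actual CLI wrapper) of the transaction
# discipline the test above describes: since the library leaves transaction management
# to the caller, a CLI entrypoint must roll back the engine's open transaction when a
# command raises, so that a half-finished commit is never persisted.
def _sketch_cli_transaction_wrapper(engine, command):
    try:
        command()
        engine.commit()  # Persist metadata only if the command ran to completion.
    except BaseException:
        engine.rollback()  # Discard any partially written image/object metadata.
        raise
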
def test_import_preserves_pending_changes(pg_repo_local):
    OUTPUT.init()
    OUTPUT.run_sql(
        """CREATE TABLE test (id integer, name varchar);
        INSERT INTO test VALUES (1, 'test')"""
    )
    head = OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    changes = get_engine().get_pending_changes(OUTPUT.to_schema(), "test")

    OUTPUT.import_tables(
        tables=["imported_fruits"], source_repository=pg_repo_local, source_tables=["fruits"]
    )

    # The import creates a new image on top of the old head but must leave the
    # pending (uncommitted) changes to "test" intact.
    assert OUTPUT.head.parent_id == head.image_hash
    assert changes == OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(), "test")

def test_import_preserves_existing_tables(pg_repo_local):
    # Create a new schema and import 'fruits' from the mounted PG table.
    head = _setup_dataset()
    OUTPUT.import_tables(
        tables=["imported_fruits"], source_repository=pg_repo_local, source_tables=["fruits"]
    )
    new_head = OUTPUT.head

    head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "test")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "imported_fruits")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "test")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "imported_fruits")

def test_import_and_update(local_engine_empty, unprivileged_pg_repo):
    OUTPUT.init()
    head = OUTPUT.head
    remote_head = unprivileged_pg_repo.images["latest"]

    # Import the 'fruits' table from the origin.
    import_table_from_remote(
        unprivileged_pg_repo, ["fruits"], remote_head.image_hash, OUTPUT, target_tables=[]
    )
    new_head = OUTPUT.head

    OUTPUT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    new_head_2 = OUTPUT.commit()

    head.checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"), (2, "orange")]

    new_head_2.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]

def test_from_local(pg_repo_local):
    execute_commands(load_splitfile("from_local.splitfile"), output=OUTPUT)
    new_head = OUTPUT.head

    # Go back to the parent: the two source tables should exist there.
    OUTPUT.images.by_hash(new_head.parent_id).checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

def test_advanced_splitfile(pg_repo_local):
    execute_commands(load_splitfile("import_local_multiple_with_queries.splitfile"), output=OUTPUT)
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    head = OUTPUT.head
    old_head = head.parent_id
    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    head.checkout()
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (2, "orange", "carrot")
    ]
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(2, "orange")]

def test_import_from_remote(local_engine_empty, unprivileged_pg_repo):
    # Start with a clean repo -- add a table to output to see if it's preserved.
    head = _setup_dataset()

    local_objects = OUTPUT.objects
    assert len(local_objects.get_downloaded_objects()) == 2
    assert len(local_objects.get_all_objects()) == 2
    assert local_engine_empty.get_all_tables(OUTPUT.to_schema()) == ["test"]

    # Import the 'fruits' table from the origin.
    remote_head = unprivileged_pg_repo.images["latest"]
    import_table_from_remote(
        unprivileged_pg_repo, ["fruits"], remote_head.image_hash, OUTPUT, target_tables=[]
    )
    new_head = OUTPUT.head

    # Check that the table now exists in the output, is committed and there's no trace
    # of the cloned repo. Also clean up the unused objects to make sure that the newly
    # cloned table is still recorded.
    assert sorted(local_engine_empty.get_all_tables(OUTPUT.to_schema())) == ["fruits", "test"]
    local_objects.cleanup()
    assert len(get_current_repositories(local_engine_empty)) == 1

    head.checkout()
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test")
    assert not local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test")
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"), (2, "orange")]

def test_import_all(local_engine_empty):
    execute_commands(load_splitfile("import_all_from_mounted.splitfile"), output=OUTPUT)
    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    old_head.checkout()
    tables = ["vegetables", "fruits"]
    contents = [[(1, "potato"), (2, "carrot")], [(1, "apple"), (2, "orange")]]
    for t in tables:
        assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert OUTPUT.run_sql("SELECT * FROM %s" % t) == c

def test_import_with_custom_query(pg_repo_local):
    # Test that importing with a custom query creates a new object.
    pg_repo_local.run_sql(
        "INSERT INTO fruits VALUES (3, 'mayonnaise');"
        "INSERT INTO vegetables VALUES (3, 'oregano')"
    )
    pg_repo_local.commit()
    all_current_objects = pg_repo_local.objects.get_all_objects()

    execute_commands(load_splitfile("import_with_custom_query.splitfile"), output=OUTPUT)
    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    # The first two tables were imported with a custom query, so they are materialized
    # as new objects; the other two simply point to the old pg_repo_local objects.
    # See the sketch after this test for why that reuse is possible.
    tables = ["my_fruits", "o_vegetables", "vegetables", "all_fruits"]
    contents = [
        [(2, "orange")],
        [(1, "potato"), (3, "oregano")],
        [(1, "potato"), (2, "carrot"), (3, "oregano")],
        [(1, "apple"), (2, "orange"), (3, "mayonnaise")],
    ]

    old_head.checkout()
    engine = OUTPUT.engine
    for t in tables:
        assert not engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert sorted(OUTPUT.run_sql("SELECT * FROM %s" % t)) == sorted(c)

    for t in tables:
        objects = head.get_table(t).objects
        if t in ["my_fruits", "o_vegetables"]:
            assert all(o not in all_current_objects for o in objects)
        else:
            assert all(o in all_current_objects for o in objects)

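# An illustrative sketch (hypothetical helper, derived from the fragment-hashing test
# further down) of why plain imports can reuse objects: object IDs are content-addressed,
# combining the table's content hash with its schema hash, so identical contents map to
# the same object ID and the importing image can point at the existing object. A custom
# query produces different contents and hence a brand-new object.
def _sketch_object_id(content_hash_hex, schema_hash_hex):
    from hashlib import sha256

    # Same derivation that test_diff_fragment_hashing_long_chain asserts against.
    return "o" + sha256((content_hash_hex + schema_hash_hex).encode("ascii")).hexdigest()[:-2]
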
def test_import_updating_splitfile_with_uploading(
    local_engine_empty, remote_engine, pg_repo_remote
):
    execute_commands(load_splitfile("import_and_update.splitfile"), output=OUTPUT)
    head = OUTPUT.head

    assert len(OUTPUT.objects.get_all_objects()) == 4  # Two original tables + two updates

    # Push with upload. Have to specify the remote repo.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository, remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})

    # Unmount everything locally and clean up.
    OUTPUT.delete()
    # OUTPUT doesn't exist any more, but we use its ObjectManager reference to access
    # the global object manager for the engine (maybe we should inject it into
    # local_engine/remote_engine instead).
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)
    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    assert len(existing_objects) == 4  # Two original tables + two updates
    # Only two objects are stored externally (the other two have been on the remote
    # the whole time).
    assert len(OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]

def test_range_index_ordering_collation(local_engine_empty):
    # Test that the range index gets min/max values of text columns using the "C"
    # collation (sorting by the byte values of characters) rather than a locale-aware
    # one (e.g. in en-US, "a" comes before "B" even though "B" has a smaller ASCII code).
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key_1 INTEGER, key_2 VARCHAR,"
        " value_1 VARCHAR, value_2 INTEGER, PRIMARY KEY (key_1, key_2))"
    )

    OUTPUT.engine.run_sql_batch(
        "INSERT INTO test VALUES (%s, %s, %s, %s)",
        [
            (1, "ONE", "apple", 4),
            (1, "one", "ORANGE", 3),
            (2, "two", "banana", 2),
            (2, "TWO", "CUCUMBER", 1),
        ],
        schema=OUTPUT.to_schema(),
    )

    head = OUTPUT.commit()
    object_id = head.get_table("test").objects[0]

    assert OUTPUT.objects.get_object_meta([object_id])[object_id].object_index == {
        "range": {
            "$pk": [[1, "ONE"], [2, "two"]],
            "key_1": [1, 2],
            "key_2": ["ONE", "two"],
            "value_1": ["CUCUMBER", "banana"],
            "value_2": [1, 4],
        }
    }

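# An illustrative sketch (not part of the original suite) of why the collation matters:
# Python's default string sort, like the "C" collation, compares code points, so all
# uppercase ASCII letters sort before lowercase ones. A locale-aware collation such as
# en-US would instead interleave cases ("apple" < "banana" < "CUCUMBER" < "ORANGE").
def _sketch_c_collation_ordering():
    values = ["apple", "ORANGE", "banana", "CUCUMBER"]
    # Byte-wise ordering puts "CUCUMBER" first and "banana" last, matching the
    # value_1 min/max pair asserted in the range index above.
    assert sorted(values) == ["CUCUMBER", "ORANGE", "apple", "banana"]
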
def test_diff_fragment_hashing_long_chain(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key TIMESTAMP PRIMARY KEY, val1 INTEGER, val2 VARCHAR, val3 NUMERIC)"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES ('2019-01-01 01:01:01.111', 1, 'one', 1.1),"
        "('2019-01-02 02:02:02.222', 2, 'two', 2.2),"
        "('2019-01-03 03:03:03.333', 3, 'three', 3.3),"
        "('2019-01-04 04:04:04.444', 4, 'four', 4.4)"
    )
    OUTPUT.commit()
    base = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "DELETE FROM test WHERE key = '2019-01-03 03:03:03.333';"
        "INSERT INTO test VALUES ('2019-01-05', 5, 'five', 5.5)"
    )
    OUTPUT.commit()
    v1 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED', val1 = 42 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v2 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED AGAIN', val1 = 43 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v3 = OUTPUT.head.get_table("test")

    om = OUTPUT.objects
    final_hash, rows_inserted = om.calculate_content_hash(OUTPUT.to_schema(), "test")
    assert rows_inserted == 4

    schema_hash = om._calculate_schema_hash(base.table_schema)

    # Check that the final hash can be assembled out of intermediate objects'
    # insertion and deletion hashes.
    ins_hash_base, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, base.objects[0]
    )
    assert (
        "o" + sha256((ins_hash_base.hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == base.objects[-1]
    )
    assert rows_inserted == 4

    ins_hash_v1, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v1.objects[-1]
    )
    assert rows_inserted == 1

    # A timestamp cast to text inside a tuple is wrapped with double quotes in PG.
    # As long as hashing is consistent (this happens with all engines no matter what
    # their conventions are), we don't really mind, but this might cause some weird
    # issues later with verifying hashes/deduplication.
    del_hash_v1 = Digest.from_hex(
        sha256('("2019-01-03 03:03:03.333",3,three,3.3)'.encode("ascii")).hexdigest()
    )
    assert del_hash_v1.hex() == "b12a93d54ba7ff1c2e26c92f01ac9c9d7716242eb47344d57c89b481227f5298"

    # Check that the object metadata contains the same hashes.
    v1_meta = om.get_object_meta(v1.objects)[v1.objects[-1]]
    assert ins_hash_v1.hex() == v1_meta.insertion_hash
    assert del_hash_v1.hex() == v1_meta.deletion_hash

    assert (
        "o"
        + sha256(((ins_hash_v1 - del_hash_v1).hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == v1.objects[-1]
    )

    ins_hash_v2, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v2.objects[-1]
    )
    assert rows_inserted == 1
    del_hash_v2 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",2,two,2.2)'.encode("ascii")).hexdigest()
    )
    assert del_hash_v2.hex() == "88e01be43523057d192b2fd65e69f651a9515b7e30d17a9fb852926b71e3bdff"
    assert ins_hash_v2.hex() == om.get_object_meta(v2.objects)[v2.objects[-1]].insertion_hash
    assert (
        "o"
        + sha256(((ins_hash_v2 - del_hash_v2).hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == v2.objects[-1]
    )

    v2_meta = om.get_object_meta(v2.objects)[v2.objects[-1]]
    assert ins_hash_v2.hex() == v2_meta.insertion_hash
    assert del_hash_v2.hex() == v2_meta.deletion_hash

    ins_hash_v3, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v3.objects[-1]
    )
    assert rows_inserted == 1
    del_hash_v3 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",42,UPDATED,2.2)'.encode("ascii")).hexdigest()
    )
    assert (
        "o"
        + sha256(((ins_hash_v3 - del_hash_v3).hex() + schema_hash).encode("ascii")).hexdigest()[:-2]
        == v3.objects[-1]
    )

    v3_meta = om.get_object_meta(v3.objects)[v3.objects[-1]]
    assert ins_hash_v3.hex() == v3_meta.insertion_hash
    assert del_hash_v3.hex() == v3_meta.deletion_hash

    # The sum of all insertion hashes minus all deletion hashes reconstitutes the
    # hash of the table's final contents.
    assert (
        ins_hash_base + ins_hash_v1 + ins_hash_v2 + ins_hash_v3
        - del_hash_v1 - del_hash_v2 - del_hash_v3
    ).hex() == final_hash
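
# An illustrative sketch (an assumption about the scheme, not the library's API) of the
# homomorphic hashing the test above relies on: a fragment's insertion/deletion hash is
# the sum of its rows' SHA-256 digests modulo 2**256. Addition and subtraction then
# commute, which is what lets the final content hash be reassembled from any chain of
# diff fragments' insertion hashes minus their deletion hashes.
def _sketch_homomorphic_content_hash(inserted_rows, deleted_rows):
    from hashlib import sha256  # Same digest used by the tests above.

    total = 0
    for row in inserted_rows:
        # Adding a row's digest; insertion order doesn't affect the sum.
        total = (total + int(sha256(row.encode("ascii")).hexdigest(), 16)) % 2 ** 256
    for row in deleted_rows:
        # Deleting a row subtracts the same digest, cancelling a prior insertion.
        total = (total - int(sha256(row.encode("ascii")).hexdigest(), 16)) % 2 ** 256
    # 64 zero-padded hex characters, analogous to Digest.hex() above.
    return "%064x" % total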