def test_base_fragment_reused_chunking(local_engine_empty): # Check that if we split a table into chunks and some chunks are the same, they get assigned to the same objects. OUTPUT.init() base = OUTPUT.head _make_test_table(OUTPUT) OUTPUT.commit(chunk_size=5) table_1 = OUTPUT.head.get_table("test") # Table 1 produced 3 objects assert len(OUTPUT.objects.get_all_objects()) == 3 # All chunks are the same base.checkout() _make_test_table(OUTPUT) OUTPUT.commit(chunk_size=5) table_2 = OUTPUT.head.get_table("test") assert len(OUTPUT.objects.get_all_objects()) == 3 assert table_1.objects == table_2.objects # Insert something else into the middle chunk so that it's different. This will get conflated so won't get recorded # as an update. base.checkout() _make_test_table(OUTPUT) OUTPUT.run_sql( "UPDATE test SET value_1 = 'UPDATED', value_2 = 42 WHERE key = 7") OUTPUT.commit(chunk_size=5) table_3 = OUTPUT.head.get_table("test") assert len(OUTPUT.objects.get_all_objects()) == 4 # Table 3 reused the first and the last object but created a new one for the middle fragment. assert len(table_3.objects) == 3 assert table_3.objects[0] == table_1.objects[0] assert table_3.objects[1] != table_1.objects[1] assert table_3.objects[2] == table_1.objects[2]
def _setup_dataset(): OUTPUT.init() OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar); INSERT INTO test VALUES (1, 'test')""") OUTPUT.commit() OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')") return OUTPUT.commit()
def test_from_remote(local_engine_empty, pg_repo_remote_multitag): # Test running commands that base new datasets on a remote repository. execute_commands(load_splitfile("from_remote.splitfile"), params={"TAG": "v1"}, output=OUTPUT) new_head = OUTPUT.head parent = OUTPUT.images.by_hash(new_head.parent_id) # Go back to the parent: the two source tables should exist there parent.checkout() assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables") assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table") new_head.checkout() assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables") assert OUTPUT.run_sql("SELECT * FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ] # Now run the same splitfile but from the v2 of the remote (where row 1 has been removed from the fruits table) # First, remove the output mountpoint (the executor tries to fetch the commit 0000 from it otherwise which # doesn't exist). OUTPUT.delete() execute_commands(load_splitfile("from_remote.splitfile"), params={"TAG": "v2"}, output=OUTPUT) assert OUTPUT.run_sql("SELECT * FROM join_table") == [(2, "orange", "carrot")]
def test_import_bare(pg_repo_local): # Check import without checking anything out, just by manipulating metadata and running LQs against # source images. # Create a new schema and import 'fruits' OUTPUT.init() # Make sure the existing table is preserved. OUTPUT.run_sql("CREATE TABLE sentinel (key INTEGER)") OUTPUT.commit() pg_repo_local.uncheckout() OUTPUT.uncheckout() OUTPUT.import_tables( tables=["imported_fruits"], source_repository=pg_repo_local, image_hash=pg_repo_local.images["latest"].image_hash, source_tables=["SELECT * FROM fruits WHERE fruit_id = 1"], parent_hash=OUTPUT.images["latest"].image_hash, do_checkout=False, table_queries=[True], ) assert OUTPUT.head is None assert pg_repo_local.head is None assert sorted(OUTPUT.images["latest"].get_tables()) == [ "imported_fruits", "sentinel" ] assert list(OUTPUT.images["latest"].get_table("imported_fruits").query( columns=["name"], quals=[])) == [{ "name": "apple" }]
def test_import_and_update(local_engine_empty, unprivileged_pg_repo): OUTPUT.init() head = OUTPUT.head remote_head = unprivileged_pg_repo.images["latest"] # Import the 'fruits' table from the origin. import_table_from_remote(unprivileged_pg_repo, ["fruits"], remote_head.image_hash, OUTPUT, target_tables=[]) new_head = OUTPUT.head OUTPUT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')") new_head_2 = OUTPUT.commit() head.checkout() assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") new_head.checkout() assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"), (2, "orange")] new_head_2.checkout() assert OUTPUT.run_sql("SELECT * FROM fruits") == [ (1, "apple"), (2, "orange"), (3, "mayonnaise"), ]
def test_bloom_index_post_factum(local_engine_empty): OUTPUT.init() OUTPUT.run_sql( "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)") for i in range(50): OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s)", (i + 1, dt(2015, 1, 1) + timedelta(days=i))) head = OUTPUT.commit() # Reindex the table after committing it and check that object metadata gets overwritten. head.get_table("test").reindex( extra_indexes={"bloom": { "value_1": { "probability": 0.01 } }}) obj = head.get_table("test").objects[0] object_index = OUTPUT.objects.get_object_meta([obj])[obj].object_index assert "bloom" in object_index # Run a snippet from the previous test to check the bloom metadata is valid. objects = head.get_table("test").objects for i in range(0, 50, 5): assert (filter_bloom_index(OUTPUT.engine, objects, [[ ("value_1", "=", dt(2015, 1, 1) + timedelta(days=i)) ]]) == objects)
def test_bloom_index_datetime(local_engine_empty): OUTPUT.init() OUTPUT.run_sql( "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)") for i in range(50): OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s)", (i + 1, dt(2015, 1, 1) + timedelta(days=i))) head = OUTPUT.commit( extra_indexes={"test": { "bloom": { "value_1": { "probability": 0.01 } } }}) objects = head.get_table("test").objects # Datetimes are supported in the bloom index if they're passed in as actual datetime # objects (which Multicorn does). # Spot check some dates that exist... for i in range(0, 50, 5): assert (filter_bloom_index(OUTPUT.engine, objects, [[ ("value_1", "=", dt(2015, 1, 1) + timedelta(days=i)) ]]) == objects) # ...and some that don't. assert (filter_bloom_index( OUTPUT.engine, objects, [[("value_1", "=", dt(2015, 1, 1) + timedelta(days=55))]]) == []) # They also work if passed in as ISO strings with space as a separator. assert (filter_bloom_index( OUTPUT.engine, objects, [[("value_1", "=", "2015-01-01 00:00:00")]]) == objects)
def test_update_without_import_splitfile(pg_repo_local): # Test that correct commits are produced by executing an splitfile (both against newly created and already # existing tables on an existing mountpoint) execute_commands(load_splitfile("update_without_import.splitfile"), output=OUTPUT) log = OUTPUT.head.get_log() log[1].checkout() assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [] log[0].checkout() assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple")]
def test_basic_splitfile(pg_repo_local): execute_commands(load_splitfile("create_table.splitfile"), output=OUTPUT) log = list(reversed(OUTPUT.head.get_log())) log[1].checkout() assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [] log[2].checkout() assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple")] log[3].checkout() assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple"), (2, "banana")]
def test_rollback_on_error(local_engine_empty): # For e.g. commit/checkout/other commands, we don't do commits/rollbacks # in the library itself and expect the caller to manage transactions. In CLI, # we need to make sure that erroneous transactions (e.g. interrupted SG commits) # are rolled back correctly instead of being committed. runner = CliRunner() OUTPUT.init() OUTPUT.run_sql( "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)" ) for i in range(11): OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)) head = OUTPUT.commit(chunk_size=5, in_fragment_order={"test": ["key", "value_1"]}) assert len(OUTPUT.images()) == 2 assert len(OUTPUT.objects.get_all_objects()) == 3 _alter_diff_splitting_dataset() OUTPUT.commit_engines() # Simulate the commit getting interrupted by the first object going through and being # recorded, then a KeyboardInterrupt being raised. called_once = False def interrupted_register(*args, **kwargs): nonlocal called_once if called_once: raise BaseException("something went wrong") else: called_once = True return FragmentManager._register_object(*args, **kwargs) with patch( "splitgraph.core.fragment_manager.FragmentManager._register_object", side_effect=interrupted_register, ) as ro: with pytest.raises(BaseException): runner.invoke(cli, ["commit", OUTPUT.to_schema()]) # Check that no image/object metadata was written assert len(OUTPUT.images()) == 2 assert len(OUTPUT.objects.get_all_objects()) == 3 assert ro.call_count == 2 # Check that the data in the audit trigger wasn't deleted assert len( OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(), table="test")) == 6
def test_import_preserves_pending_changes(pg_repo_local): OUTPUT.init() OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar); INSERT INTO test VALUES (1, 'test')""") head = OUTPUT.commit() OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')") changes = get_engine().get_pending_changes(OUTPUT.to_schema(), "test") OUTPUT.import_tables(tables=["imported_fruits"], source_repository=pg_repo_local, source_tables=["fruits"]) assert OUTPUT.head.parent_id == head.image_hash assert changes == OUTPUT.engine.get_pending_changes( OUTPUT.to_schema(), "test")
def test_import_from_remote(local_engine_empty, unprivileged_pg_repo): # Start with a clean repo -- add a table to output to see if it's preserved. head = _setup_dataset() local_objects = OUTPUT.objects assert len(local_objects.get_downloaded_objects()) == 2 assert len(local_objects.get_all_objects()) == 2 assert local_engine_empty.get_all_tables(OUTPUT.to_schema()) == ["test"] # Import the 'fruits' table from the origin. remote_head = unprivileged_pg_repo.images["latest"] import_table_from_remote(unprivileged_pg_repo, ["fruits"], remote_head.image_hash, OUTPUT, target_tables=[]) new_head = OUTPUT.head # Check that the table now exists in the output, is committed and there's no trace of the cloned repo. # Also clean up the unused objects to make sure that the newly cloned table is still recorded. assert sorted(local_engine_empty.get_all_tables( OUTPUT.to_schema())) == ["fruits", "test"] local_objects.cleanup() assert len(get_current_repositories(local_engine_empty)) == 1 head.checkout() assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test") assert not local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits") new_head.checkout() assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test") assert local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"), (2, "orange")]
def test_splitfile_remote(local_engine_empty, pg_repo_remote_multitag): # We use the v1 tag when importing from the remote, so fruit_id = 1 still exists there. execute_commands(load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT) assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ] # Now run the commands against v2 and make sure the fruit_id = 1 has disappeared from the output. execute_commands(load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v2"}, output=OUTPUT) assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [ (2, "orange", "carrot") ]
def test_splitfile_inline_sql(readonly_pg_repo, pg_repo_local): # Test SQL commands accessing repos directly -- join a remote repo with # some local data. prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True) pg_repo_local.head.tag("v2") execute_commands( load_splitfile("inline_sql.splitfile"), output=OUTPUT, ) new_head = OUTPUT.head new_head.checkout() assert new_head.get_tables() == ["balanced_diet"] assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [ (1, "apple", None, "potato"), (2, "orange", datetime.datetime(2019, 1, 1, 12, 0), "carrot"), ] local_repo_head = pg_repo_local.head.image_hash other_repo_head = readonly_pg_repo.images["latest"].image_hash assert new_head.provenance_data == [ { "sources": [ { "source": "pg_mount", "source_hash": other_repo_head, "source_namespace": "otheruser", }, { "source": "pg_mount", "source_hash": local_repo_head, "source_namespace": "test" }, ], "sql": ("CREATE TABLE balanced_diet\n" " AS SELECT fruits.fruit_id AS id\n" " , fruits.name AS fruit\n" " , my_fruits.timestamp AS timestamp\n" " , vegetables.name AS vegetable\n" " FROM " '"otheruser/pg_mount:{0}".fruits ' "AS fruits\n" " INNER JOIN " '"otheruser/pg_mount:{0}".vegetables ' "AS vegetables ON fruits.fruit_id = vegetable_id\n" " LEFT JOIN " '"test/pg_mount:{1}".fruits ' "AS my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n" "\n" "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)").format( other_repo_head, local_repo_head), "type": "SQL", }, ]
def test_layered_querying_json_arrays(local_engine_empty): OUTPUT.init() OUTPUT.run_sql( "CREATE TABLE test (key INTEGER PRIMARY KEY, " "value JSONB, arr_value INTEGER[], arr_2d_value TEXT[][])" ) OUTPUT.run_sql( "INSERT INTO test VALUES (1, %s, %s, %s)", (json.dumps({"a": 1, "b": 2.5}), [1, 2, 3], [["one", "two"], ["three", "four"]]), ) OUTPUT.commit() OUTPUT.run_sql( "INSERT INTO test VALUES (2, %s, %s, %s)", ( json.dumps({"a": "one", "b": "two point five"}), [4, 5, 6], [["five", "six"], ["seven", "eight"]], ), ) head = OUTPUT.commit() OUTPUT.uncheckout() head.checkout(layered=True) assert OUTPUT.run_sql("SELECT * FROM test ORDER BY key") == [ (1, {"a": 1, "b": 2.5}, [1, 2, 3], [["one", "two"], ["three", "four"]]), (2, {"a": "one", "b": "two point five"}, [4, 5, 6], [["five", "six"], ["seven", "eight"]]), ]
def test_advanced_splitfile(pg_repo_local): execute_commands( load_splitfile("import_local_multiple_with_queries.splitfile"), output=OUTPUT) assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables") assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table") head = OUTPUT.head old_head = head.parent_id OUTPUT.images.by_hash(old_head).checkout() assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table") head.checkout() assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [ (2, "orange", "carrot") ] assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(2, "orange")]
def test_splitfile_remote_hash(local_engine_empty, pg_repo_remote): head = pg_repo_remote.head.image_hash execute_commands(load_splitfile("import_remote_multiple.splitfile"), params={"TAG": head[:10]}, output=OUTPUT) assert OUTPUT.run_sql( "SELECT id, fruit, vegetable FROM output.join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ]
def test_splitfile(local_engine_empty, pg_repo_remote): runner = CliRunner() result = runner.invoke( build_c, [ RESOURCES + "import_remote_multiple.splitfile", "-a", "TAG", "latest", "-o", "output", ], ) assert result.exit_code == 0 assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ] # Test the sgr provenance command. First, just list the dependencies of the new image. result = runner.invoke(provenance_c, ["output:latest"]) assert "test/pg_mount:%s" % pg_repo_remote.images[ "latest"].image_hash in result.output # Second, output the full splitfile (-f) result = runner.invoke(provenance_c, ["output:latest", "-f"]) assert ("FROM test/pg_mount:%s IMPORT" % pg_repo_remote.images["latest"].image_hash in result.output) assert "SQL {CREATE TABLE join_table" in result.output # Test reverse dependencies # We're looking at test/pg_mount on the local engine which doesn't exist -- this should fail. result = runner.invoke( dependents_c, [ "test/pg_mount:%s" % pg_repo_remote.images["latest"].image_hash, ], ) assert result.exit_code == 1 # Now look at test/pg_mount on the remote and look for dependents on the local engine. result = runner.invoke( dependents_c, [ "test/pg_mount:%s" % pg_repo_remote.images["latest"].image_hash, "--dependents-on", "LOCAL", "--source-on", pg_repo_remote.engine.name, ], ) assert result.exit_code == 0 assert "is depended on by" in result.output assert "%s:%s" % (OUTPUT, OUTPUT.head.image_hash) in result.output
def test_splitfile_schema_changes(pg_repo_local, mg_repo_local): execute_commands(load_splitfile("schema_changes.splitfile"), output=OUTPUT) old_output_head = OUTPUT.head # Then, alter the dataset and rerun the splitfile. pg_repo_local.run_sql("INSERT INTO fruits VALUES (12, 'mayonnaise')") pg_repo_local.commit() execute_commands(load_splitfile("schema_changes.splitfile"), output=OUTPUT) new_output_head = OUTPUT.head old_output_head.checkout() assert OUTPUT.run_sql("SELECT * FROM spirit_fruits") == [("James", "orange", 12)] new_output_head.checkout() # Mayonnaise joined with Alex, ID 12 + 10 = 22. assert OUTPUT.run_sql("SELECT * FROM spirit_fruits") == [ ("James", "orange", 12), ("Alex", "mayonnaise", 22), ]
def test_splitfile_with_external_sql(readonly_pg_repo): # Tests are running from root so we pass in the path to the SQL manually to the splitfile. execute_commands( load_splitfile("external_sql.splitfile"), params={"EXTERNAL_SQL_FILE": RESOURCES + "external_sql.sql"}, output=OUTPUT, ) assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ]
def test_from_remote_hash(local_engine_empty, pg_repo_remote): head = pg_repo_remote.head.image_hash # Test running commands that base new datasets on a remote repository. execute_commands(load_splitfile("from_remote.splitfile"), params={"TAG": head[:10]}, output=OUTPUT) assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables") assert OUTPUT.run_sql("SELECT * FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ]
def test_import_basic(pg_repo_local): # Create a new schema and import 'fruits' from the mounted PG table. OUTPUT.init() head = OUTPUT.head OUTPUT.import_tables(tables=["imported_fruits"], source_repository=pg_repo_local, source_tables=["fruits"]) assert OUTPUT.run_sql("SELECT * FROM imported_fruits" ) == pg_repo_local.run_sql("SELECT * FROM fruits") new_head = OUTPUT.head assert new_head != head assert new_head.parent_id == head.image_hash
def test_import_multiple_tables(pg_repo_local): OUTPUT.init() head = OUTPUT.head OUTPUT.import_tables(tables=[], source_repository=pg_repo_local, source_tables=[]) for table_name in ["fruits", "vegetables"]: assert OUTPUT.run_sql("SELECT * FROM %s" % table_name) == pg_repo_local.run_sql( "SELECT * FROM %s" % table_name) new_head = OUTPUT.head assert new_head != head assert new_head.parent_id == head.image_hash
def test_range_index_ordering_collation(local_engine_empty): # Test that range index gets min/max values of text columns using the "C" collation # (sort by byte values of characters) rather than anything else (e.g. in en-US # "a" comes before "B" even though "B" has a smaller ASCII code). OUTPUT.init() OUTPUT.run_sql( "CREATE TABLE test (key_1 INTEGER, key_2 VARCHAR," " value_1 VARCHAR, value_2 INTEGER, PRIMARY KEY (key_1, key_2))") OUTPUT.engine.run_sql_batch( "INSERT INTO test VALUES (%s, %s, %s, %s)", [ (1, "ONE", "apple", 4), (1, "one", "ORANGE", 3), (2, "two", "banana", 2), (2, "TWO", "CUCUMBER", 1), ], schema=OUTPUT.to_schema(), ) head = OUTPUT.commit() object_id = head.get_table("test").objects[0] assert OUTPUT.objects.get_object_meta([object_id ])[object_id].object_index == { "range": { "$pk": [[1, "ONE"], [2, "two"]], "key_1": [1, 2], "key_2": ["ONE", "two"], "value_1": ["CUCUMBER", "banana"], "value_2": [1, 4], } }
def test_import_all(local_engine_empty): execute_commands(load_splitfile("import_all_from_mounted.splitfile"), output=OUTPUT) head = OUTPUT.head old_head = OUTPUT.images.by_hash(head.parent_id) old_head.checkout() tables = ["vegetables", "fruits"] contents = [[(1, "potato"), (2, "carrot")], [(1, "apple"), (2, "orange")]] for t in tables: assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), t) head.checkout() for t, c in zip(tables, contents): assert OUTPUT.run_sql("SELECT * FROM %s" % t) == c
def test_splitfile_object_download_failure(local_engine_empty, pg_repo_remote_multitag): # Simulate an object download failure (that happens inside of the engine during IMPORT # execution) propagating to the caller and not leaving the engine in an inconsistent state. object_id = pg_repo_remote_multitag.images["v1"].get_table("fruits").objects[0] assert object_id == "o0e742bd2ea4927f5193a2c68f8d4c51ea018b1ef3e3005a50727147d2cf57b" tmp_object_id = "o" + "0" * 62 pg_repo_remote_multitag.engine.run_sql( SQL("ALTER TABLE splitgraph_meta.{} RENAME TO {}").format( Identifier(object_id), Identifier(tmp_object_id) ) ) assert len(OUTPUT.images()) == 0 assert _get_table_count(OUTPUT) == 0 with pytest.raises(ObjectCacheError) as e: execute_commands( load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT ) assert "Missing 1 object (%s)" % object_id in str(e.value) # Check the execution didn't create the image assert len(OUTPUT.images()) == 0 assert _get_table_count(OUTPUT) == 0 # Rename the object back and retry the Splitfile pg_repo_remote_multitag.engine.run_sql( SQL("ALTER TABLE splitgraph_meta.{} RENAME TO {}").format( Identifier(tmp_object_id), Identifier(object_id) ) ) execute_commands( load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT ) OUTPUT.head.checkout() assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ] assert len(OUTPUT.images()) == 3 # 2 tables in the first non-empty image, 3 tables in the second image # (previous 2 + joined data). assert _get_table_count(OUTPUT) == 5
def test_from_local(pg_repo_local): execute_commands(load_splitfile("from_local.splitfile"), output=OUTPUT) new_head = OUTPUT.head # Go back to the parent: the two source tables should exist there OUTPUT.images.by_hash(new_head.parent_id).checkout() assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables") assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table") new_head.checkout() assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits") assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables") assert OUTPUT.run_sql("SELECT * FROM join_table") == [ (1, "apple", "potato"), (2, "orange", "carrot"), ]
def test_import_with_custom_query(pg_repo_local): # Test that importing with a custom query creates a new object pg_repo_local.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise');" "INSERT INTO vegetables VALUES (3, 'oregano')") pg_repo_local.commit() all_current_objects = pg_repo_local.objects.get_all_objects() execute_commands(load_splitfile("import_with_custom_query.splitfile"), output=OUTPUT) head = OUTPUT.head old_head = OUTPUT.images.by_hash(head.parent_id) # First two tables imported as new objects since they had a custom query, the other two get pointed # to the old pg_repo_local objects. tables = ["my_fruits", "o_vegetables", "vegetables", "all_fruits"] contents = [ [(2, "orange")], [(1, "potato"), (3, "oregano")], [(1, "potato"), (2, "carrot"), (3, "oregano")], [(1, "apple"), (2, "orange"), (3, "mayonnaise")], ] old_head.checkout() engine = OUTPUT.engine for t in tables: assert not engine.table_exists(OUTPUT.to_schema(), t) head.checkout() for t, c in zip(tables, contents): assert sorted(OUTPUT.run_sql("SELECT * FROM %s" % t)) == sorted(c) for t in tables: objects = head.get_table(t).objects if t in ["my_fruits", "o_vegetables"]: assert all(o not in all_current_objects for o in objects) else: assert all(o in all_current_objects for o in objects)
def test_import_updating_splitfile_with_uploading(local_engine_empty, remote_engine, pg_repo_remote): execute_commands(load_splitfile("import_and_update.splitfile"), output=OUTPUT) head = OUTPUT.head assert len(OUTPUT.objects.get_all_objects() ) == 4 # Two original tables + two updates # Push with upload. Have to specify the remote repo. remote_output = Repository(OUTPUT.namespace, OUTPUT.repository, remote_engine) OUTPUT.push(remote_output, handler="S3", handler_options={}) # Unmount everything locally and cleanup OUTPUT.delete() # OUTPUT doesn't exist but we use its ObjectManager reference to access the global object # manager for the engine (maybe should inject it into local_engine/remote_engine instead) OUTPUT.objects.cleanup() assert not OUTPUT.objects.get_all_objects() clone(OUTPUT.to_schema(), download_all=False) assert not OUTPUT.objects.get_downloaded_objects() existing_objects = list(OUTPUT.objects.get_all_objects()) assert len(existing_objects) == 4 # Two original tables + two updates # Only 2 objects are stored externally (the other two have been on the remote the whole time) assert len( OUTPUT.objects.get_external_object_locations(existing_objects)) == 2 head.checkout() assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [ (1, "apple"), (2, "orange"), (3, "mayonnaise"), ]
def test_bloom_reindex_changed_table(local_engine_empty): OUTPUT.init() OUTPUT.run_sql( "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)" ) for i in range(26): OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)", (i + 1, chr(ord("a") + i), i * 2)) OUTPUT.commit(chunk_size=13) OUTPUT.run_sql("DELETE FROM test WHERE key = 5") # ('e', 8) head = OUTPUT.commit() objects = head.get_table("test").objects assert len(objects) == 3 # original 2 fragments and one overwrite index_spec = { "bloom": { "value_1": { "probability": 0.01 }, "value_2": { "probability": 0.01 } } } # Since the patch object deletes the old value, we don't immediately know what it was and so # can't reindex that object. with pytest.raises(ObjectIndexingError) as e: head.get_table("test").reindex(extra_indexes=index_spec) assert "1 object" in str(e.value) reindexed = head.get_table("test").reindex(extra_indexes=index_spec, raise_on_patch_objects=False) assert objects[0] in reindexed assert objects[1] in reindexed assert objects[2] not in reindexed