def test_from_remote(local_engine_empty, pg_repo_remote_multitag):
    """Run a splitfile that bases a new dataset on a remote repository.

    The splitfile is executed once with TAG=v1 and once with TAG=v2; after
    each run the contents of ``join_table`` in the output are checked.
    """
    splitfile_v1 = load_splitfile("from_remote.splitfile")
    execute_commands(splitfile_v1, params={"TAG": "v1"}, output=OUTPUT)

    head_image = OUTPUT.head
    parent_image = OUTPUT.images.by_hash(head_image.parent_id)

    # In the parent image the two source tables should exist, but the join
    # result should not have been created yet.
    parent_image.checkout()
    for source_table in ("fruits", "vegetables"):
        assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), source_table)
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    # Back at the new head the source tables are still there and the join
    # table holds the joined rows.
    head_image.checkout()
    for source_table in ("fruits", "vegetables"):
        assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), source_table)
    joined_rows = OUTPUT.run_sql("SELECT * FROM join_table")
    assert joined_rows == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Repeat against v2 of the remote (where row 1 has been removed from the
    # fruits table). The output mountpoint has to be removed first: otherwise
    # the executor tries to fetch the commit 0000 from it, which doesn't exist.
    OUTPUT.delete()
    splitfile_v2 = load_splitfile("from_remote.splitfile")
    execute_commands(splitfile_v2, params={"TAG": "v2"}, output=OUTPUT)
    joined_rows = OUTPUT.run_sql("SELECT * FROM join_table")
    assert joined_rows == [(2, "orange", "carrot")]
def test_import_updating_splitfile_with_uploading(
    local_engine_empty, remote_engine, pg_repo_remote
):
    """Build an import-and-update dataset, push it with upload, then re-clone it.

    After the push, the local copy is deleted and cleaned up; the repository is
    then cloned with ``download_all=False`` and the object bookkeeping and the
    checked-out data are verified.
    """
    splitfile = load_splitfile("import_and_update.splitfile")
    execute_commands(splitfile, output=OUTPUT)
    built_head = OUTPUT.head

    # Two original tables + two updates
    object_count = len(OUTPUT.objects.get_all_objects())
    assert object_count == 4

    # Push with upload. The remote repository has to be specified explicitly.
    remote_repo = Repository(OUTPUT.namespace, OUTPUT.repository, remote_engine)
    OUTPUT.push(remote_repo, handler="S3", handler_options={})

    # Unmount everything locally and clean up.
    OUTPUT.delete()
    # OUTPUT doesn't exist at this point, but its ObjectManager reference is
    # used to access the global object manager for the engine (maybe it should
    # be injected into local_engine/remote_engine instead).
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)
    assert not OUTPUT.objects.get_downloaded_objects()

    # Two original tables + two updates
    known_objects = list(OUTPUT.objects.get_all_objects())
    assert len(known_objects) == 4

    # Only 2 objects are stored externally (the other two have been on the
    # remote the whole time).
    external_locations = OUTPUT.objects.get_external_object_locations(known_objects)
    assert len(external_locations) == 2

    built_head.checkout()
    fruit_rows = OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits")
    assert fruit_rows == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]