Example 1
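Both examples are pytest tests from Splitgraph's test suite (local_engine_empty, pg_repo_remote_multitag, remote_engine and pg_repo_remote are fixtures). A sketch of the imports the code assumes; the module paths are a best guess and may differ between Splitgraph versions, and OUTPUT/load_splitfile are test-suite helpers rather than library API:

# Assumed imports (paths may vary by Splitgraph version):
from splitgraph.core.repository import Repository, clone
from splitgraph.splitfile import execute_commands
from test.splitgraph.conftest import OUTPUT, load_splitfile
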
def test_from_remote(local_engine_empty, pg_repo_remote_multitag):
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    new_head = OUTPUT.head
    parent = OUTPUT.images.by_hash(new_head.parent_id)
    # Go back to the parent: the two source tables should exist there
    parent.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the same splitfile against v2 of the remote (where row 1 has been
    # removed from the fruits table). First, delete the output repository:
    # otherwise, the executor tries to fetch the nonexistent commit 0000 from it.
    OUTPUT.delete()
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v2"},
                     output=OUTPUT)

    assert OUTPUT.run_sql("SELECT * FROM join_table") == [(2, "orange", "carrot")]
Example 2
def test_import_updating_splitfile_with_uploading(local_engine_empty,
                                                  remote_engine,
                                                  pg_repo_remote):
    execute_commands(load_splitfile("import_and_update.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head

    # Two original tables + two updates
    assert len(OUTPUT.objects.get_all_objects()) == 4

    # Push with object upload; the remote repository has to be specified explicitly.
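    # (Assumption about Splitgraph's S3 handler: object payloads are uploaded to
    # the S3-compatible storage configured for the engine, and an empty
    # handler_options dict falls back to those configured defaults.)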
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository,
                               remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})
    # Unmount everything locally and clean up
    OUTPUT.delete()

    # OUTPUT no longer exists, but its ObjectManager reference still gives us the
    # engine's global object manager (maybe this should be injected into
    # local_engine/remote_engine instead).
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

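    # Lazy clone: with download_all=False, only image/object metadata is copied;
    # object payloads stay on the remote until a checkout needs them.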
    clone(OUTPUT.to_schema(), download_all=False)

    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    assert len(existing_objects) == 4  # Two original tables + two updates
    # Only 2 objects are stored externally (the other two have been on the remote the whole time)
    assert len(OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
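
Example 2's clone(..., download_all=False) is Splitgraph's lazy-download path: cloning copies only image and object metadata, and object payloads are fetched the first time a checkout needs them. A minimal sketch of that pattern under the same fixtures; the "latest" tag lookup is an assumption about the cloned repository, and only a non-empty download set is asserted since the exact count depends on the image:

clone(OUTPUT.to_schema(), download_all=False)
assert not OUTPUT.objects.get_downloaded_objects()  # metadata only, no payloads yet
OUTPUT.images["latest"].checkout()  # fetches the required objects on demand
assert OUTPUT.objects.get_downloaded_objects()  # now non-empty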