Example #1
def test_push_own_delete_own(local_engine_empty, unprivileged_pg_repo):
    destination = Repository.from_template(unprivileged_pg_repo,
                                           engine=local_engine_empty)
    clone(unprivileged_pg_repo, local_repository=destination)

    destination.images["latest"].checkout()
    destination.run_sql(
        """UPDATE fruits SET name = 'banana' WHERE fruit_id = 1""")
    destination.commit()

    # Test we can push to our own namespace. We can't upload the objects into splitgraph_meta
    # directly since we can't create tables there, hence the S3 upload handler.
    remote_destination = Repository.from_template(
        destination,
        namespace=unprivileged_pg_repo.engine.conn_params["SG_NAMESPACE"],
        engine=unprivileged_pg_repo.engine,
    )
    destination.upstream = remote_destination

    destination.push(handler="S3")
    # Test we can delete a single image from our own repo
    assert len(remote_destination.images()) == 3
    remote_destination.images.delete([destination.images["latest"].image_hash])
    assert len(remote_destination.images()) == 2

    # Test we can delete our own repo once we've pushed it
    remote_destination.delete()
    assert len(remote_destination.images()) == 0
Example #2
def test_pull_download_error(local_engine_empty, unprivileged_pg_repo,
                             clean_minio, interrupted):
    # Same test in reverse: if the download is aborted or fails partway through a pull,
    # make sure we can recover and retry pulling the repo.

    with patch.dict(
            "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS",
            {"S3": _flaky_handler(interrupted)},
    ):
        with pytest.raises(Exception) as e:
            clone(unprivileged_pg_repo,
                  local_repository=PG_MNT,
                  download_all=True)

    # Check that the pull succeeded (repository registered locally) but the objects
    # are just marked as external, not downloaded
    assert repository_exists(PG_MNT)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 1
    assert len(
        PG_MNT.objects.get_external_object_locations(
            PG_MNT.objects.get_all_objects())) == 2
    assert (PG_MNT.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
        return_shape=ResultShape.ONE_ONE,
    ) == 1)

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=True)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2
    assert len(list(PG_MNT.images)) == 2
    assert (PG_MNT.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
        return_shape=ResultShape.ONE_ONE,
    ) == 2)
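
The `_flaky_handler` helper patched in above isn't shown in this section. A minimal sketch of how such a wrapper could look, assuming the S3 handler follows splitgraph's external object handler interface with `upload_objects`/`download_objects` methods (the class name, import path and signatures here are assumptions, not the actual test helper):

from splitgraph.hooks.s3 import S3ExternalObjectHandler


def _flaky_handler(incomplete=False):
    # Return a handler class that simulates an interrupted transfer: optionally
    # let the first object through, then raise midway.
    class FlakyHandler(S3ExternalObjectHandler):
        def upload_objects(self, objects, remote_engine):
            if incomplete:
                # Let one object through so it gets registered upstream.
                super().upload_objects(objects[:1], remote_engine)
            raise ValueError("Simulated mid-upload failure")

        def download_objects(self, objects, remote_engine):
            super().download_objects(objects[:1], remote_engine)
            raise ValueError("Simulated mid-download failure")

    return FlakyHandler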
Example #3
def test_pull(local_engine_empty, pg_repo_remote, download_all):
    # Pull the schema from the remote.
    # Here, it's the Postgres instance on local_engine that connects to the remote engine, so we can
    # use the actual hostname (as opposed to the one exposed to us). However, the clone procedure
    # also uses that connection string to talk to the remote, hence the /etc/hosts indirection on
    # the host mapping the remote engine to localhost.
    clone(pg_repo_remote, local_repository=PG_MNT, download_all=download_all)
    PG_MNT.images.by_hash(pg_repo_remote.head.image_hash).checkout()

    head_1 = _add_image_to_repo(pg_repo_remote)

    # Check the data is unchanged on the pulled one.
    assert PG_MNT.run_sql("SELECT * FROM fruits") == [(1, "apple"),
                                                      (2, "orange")]

    with pytest.raises(ImageNotFoundError):
        PG_MNT.images.by_hash(head_1.image_hash)

    PG_MNT.pull()
    head_1 = PG_MNT.images.by_hash(head_1.image_hash)

    # Check out the newly-pulled commit and verify it has the same data.
    head_1.checkout()

    assert PG_MNT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
    assert PG_MNT.head == head_1
Example #4
def clone_c(remote_repository_or_image, local_repository, remote, download_all,
            overwrite_object_meta, tags):
    """
    Clone a remote Splitgraph repository/image into a local one.

    The lookup path for the repository is governed by the ``SG_REPO_LOOKUP`` and ``SG_REPO_LOOKUP_OVERRIDE``
    config parameters and can be overridden by the command line ``--remote`` option.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine
    from splitgraph.core.repository import clone

    remote_repository, image = remote_repository_or_image

    # If the user passed in a remote, we can inject that into the repository spec.
    # Otherwise, we have to turn the repository into a string and let clone() look up the
    # actual engine the repository lives on.
    if remote:
        remote_repository = Repository.from_template(remote_repository,
                                                     engine=get_engine(remote))
    else:
        remote_repository = remote_repository.to_schema()

    clone(
        remote_repository,
        local_repository=local_repository,
        download_all=download_all,
        single_image=image,
        overwrite_objects=overwrite_object_meta,
        overwrite_tags=tags,
    )
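
`clone_c` is the `sgr clone` CLI entry point (a click command), so it can be exercised with click's test runner. A hedged sketch; the argument spellings below are assumptions based on the parameter names above, not the actual option declarations:

from click.testing import CliRunner

runner = CliRunner()
# Hypothetical invocation: clone one image of a repository from a named remote.
result = runner.invoke(
    clone_c,
    ["--remote", "origin", "--download-all", "someuser/somerepo:latest"])
assert result.exit_code == 0, result.output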
Example #5
def test_s3_presigned_url(local_engine_empty, unprivileged_pg_repo,
                          clean_minio):
    # Test the URL signing stored procedure works on the remote machine
    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    PG_MNT.images["latest"].checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    head = PG_MNT.commit()
    object_id = head.get_table("fruits").objects[0]

    # Do a test calling the signer locally (the tests currently have access
    # to the S3 credentials on the host they're running on)
    urls_local = get_object_upload_urls("%s:%s" % (S3_HOST, S3_PORT),
                                        [object_id])
    assert len(urls_local) == 1
    assert len(urls_local[0]) == 3
    urls_local = get_object_download_urls("%s:%s" % (S3_HOST, S3_PORT),
                                          [object_id])
    assert len(urls_local) == 1
    assert len(urls_local[0]) == 3

    urls = unprivileged_pg_repo.engine.run_sql(
        "SELECT * FROM splitgraph_api.get_object_upload_urls(%s, %s)",
        ("%s:%s" % (S3_HOST, S3_PORT), [object_id]),
        return_shape=ResultShape.ONE_ONE,
    )
    assert len(urls) == 1
    assert len(urls[0]) == 3
Example #6
def _get_local_image_for_import(hash_or_tag: str,
                                repository: Repository) -> Tuple[Image, bool]:
    """
    Converts a remote repository and tag into an Image object that exists on the engine,
    optionally pulling the repository or cloning it into a temporary location.

    :param hash_or_tag: Hash/tag
    :param repository: Name of the repository (doesn't need to be local)
    :return: Image object and a boolean flag showing whether the repository should be deleted
    when the image is no longer needed.
    """
    tmp_repo = Repository(repository.namespace,
                          repository.repository + "_tmp_clone")
    repo_is_temporary = False

    logging.info("Resolving repository %s", repository)
    source_repo = lookup_repository(repository.to_schema(), include_local=True)
    if source_repo.engine.name != "LOCAL":
        clone(source_repo, local_repository=tmp_repo, download_all=False)
        source_image = tmp_repo.images[hash_or_tag]
        repo_is_temporary = True
    else:
        # For local repositories, first try to pull them to see if they are clones of a remote.
        if source_repo.upstream:
            source_repo.pull()
        source_image = source_repo.images[hash_or_tag]

    return source_image, repo_is_temporary
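
A sketch of how a caller might consume the returned flag; the cleanup pattern is an assumption (mirroring the `Repository.delete` calls used elsewhere in these examples):

source_image, repo_is_temporary = _get_local_image_for_import(
    "latest", Repository("someuser", "somerepo"))
try:
    pass  # ... import tables from source_image here ...
finally:
    if repo_is_temporary:
        # Assumes Image keeps a reference back to its (temporary) repository.
        source_image.repository.delete()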
Example #7
def _setup_object_cache_test(pg_repo_remote, longer_chain=False):
    pg_repo_local = clone(pg_repo_remote)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    if longer_chain:
        pg_repo_local.run_sql("INSERT INTO FRUITS VALUES (4, 'kumquat')")
        pg_repo_local.commit()

    # Same setup as the LQ test in the beginning: we clone a repo from upstream, don't download anything, all
    # objects are on Minio.
    remote = pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_remote.objects.delete_objects(
        remote.objects.get_downloaded_objects())
    pg_repo_remote.commit_engines()
    pg_repo_local.objects.cleanup()
    pg_repo_local = clone(pg_repo_remote, download_all=False)

    # 6 objects in the tree (original fragment, new base fragment and a patch on top of that fragment
    # for both tables); 7 if longer_chain added an extra patch on top.
    assert len(pg_repo_local.objects.get_all_objects()) == (
        6 if not longer_chain else 7)
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(remote.objects.get_all_objects()) == (
        6 if not longer_chain else 7)
    assert len(remote.objects.get_downloaded_objects()) == 0

    # Nothing has yet been downloaded (cache entries only for externally downloaded things)
    assert (len(
        pg_repo_local.engine.run_sql(
            "SELECT * FROM splitgraph_meta.object_cache_status")) == 0)

    return pg_repo_local
Example #8
def test_push(local_engine_empty, pg_repo_remote):
    # Clone from the remote engine, as in the previous test.
    clone(pg_repo_remote, local_repository=PG_MNT)

    remote_head = pg_repo_remote.head
    PG_MNT.images.by_hash(remote_head.image_hash).checkout()

    # Then, change our copy and commit.
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    head_1 = PG_MNT.commit()

    # Now, push to remote.
    PG_MNT.push(remote_repository=pg_repo_remote)

    # See if the original mountpoint got updated.
    assert len(pg_repo_remote.objects.get_all_objects()) == 3

    pg_repo_remote.images.by_hash(head_1.image_hash).checkout()
    assert pg_repo_remote.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]

    # Recommit the local image as a full snap and push it out.
    head_2 = PG_MNT.commit(snap_only=True)
    PG_MNT.push(remote_repository=pg_repo_remote)
    assert head_2.get_table(
        "fruits").objects[0] in pg_repo_remote.objects.get_all_objects()

    # Recommit it again, changing the sort order
    head_3 = PG_MNT.commit(snap_only=True,
                           in_fragment_order={"fruits": ["name"]},
                           overwrite=True)
    assert head_3.get_table("fruits").objects == head_2.get_table(
        "fruits").objects

    assert PG_MNT.run_sql(
        SQL("SELECT fruit_id FROM {}.{}").format(
            Identifier(SPLITGRAPH_META_SCHEMA),
            Identifier(head_2.get_table("fruits").objects[0])),
        return_shape=ResultShape.MANY_ONE,
    ) == [1, 3, 2]

    # Force push overwriting object meta and the actual object
    PG_MNT.push(
        remote_repository=pg_repo_remote,
        single_image=head_3.image_hash,
        overwrite_objects=True,
        reupload_objects=True,
    )

    assert pg_repo_remote.run_sql(
        SQL("SELECT fruit_id FROM {}.{}").format(
            Identifier(SPLITGRAPH_META_SCHEMA),
            Identifier(head_2.get_table("fruits").objects[0])),
        return_shape=ResultShape.MANY_ONE,
    ) == [1, 3, 2]
Example #9
def readonly_pg_repo(unprivileged_remote_engine, pg_repo_remote_registry):
    target = Repository.from_template(pg_repo_remote_registry, namespace=READONLY_NAMESPACE)
    clone(pg_repo_remote_registry, target)
    pg_repo_remote_registry.delete(uncheckout=False)
    pg_repo_remote_registry.engine.run_sql(
        "UPDATE splitgraph_meta.objects SET namespace=%s WHERE namespace=%s",
        (READONLY_NAMESPACE, REMOTE_NAMESPACE),
    )
    pg_repo_remote_registry.engine.commit()
    yield Repository.from_template(target, engine=unprivileged_remote_engine)
Example #10
def test_pull_single_image(local_engine_empty, pg_repo_remote, download_all):
    head = pg_repo_remote.head
    head_1 = _add_image_to_repo(pg_repo_remote)

    head.tag("tag_1")
    head_1.tag("tag_2")
    pg_repo_remote.commit_engines()

    # Clone a single image first
    assert len(PG_MNT.images()) == 0
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote.images()) == 3
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        download_all=download_all,
        single_image=head.image_hash[:12],
    )

    # Check only one image got downloaded and check we didn't try
    # to pull tags for images that we weren't pulling.
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images()[0] == head
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Try doing the same thing again
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        download_all=download_all,
        single_image=head.image_hash[:12],
    )
    assert len(PG_MNT.images()) == 1

    # If we're downloading objects too, check only the original objects got downloaded
    if download_all:
        assert len(PG_MNT.objects.get_downloaded_objects()) == 2

    # Pull the remainder of the repo
    PG_MNT.pull(single_image=head_1.image_hash, download_all=download_all)
    assert len(PG_MNT.images()) == 2
    if download_all:
        assert len(PG_MNT.objects.get_downloaded_objects()) == 3

    assert PG_MNT.images["tag_2"] == head_1

    # Pull the whole repo
    PG_MNT.pull()
    assert len(PG_MNT.images()) == 3
Example #11
def _execute_from(
        node: Node,
        output: Repository) -> Tuple[Repository, Optional[ProvenanceLine]]:
    interesting_nodes = extract_nodes(node, ["repo_source", "repository"])
    repo_source = get_first_or_none(interesting_nodes, "repo_source")
    output_node = get_first_or_none(interesting_nodes, "repository")
    provenance: Optional[ProvenanceLine] = None

    if output_node:
        # AS (output) detected, change the current output repository to it.
        output = Repository.from_schema(output_node.match.group(0))
        logging.info("Changed output repository to %s" % str(output))

        # NB this destroys all data in the case where we ran some commands in the Splitfile and then
        # did FROM (...) without AS repository
        if repository_exists(output):
            logging.info("Clearing all output from %s" % str(output))
            output.delete()
    if not repository_exists(output):
        output.init()
    if repo_source:
        repository, tag_or_hash = parse_image_spec(repo_source)
        source_repo = lookup_repository(repository.to_schema(),
                                        include_local=True)

        if source_repo.engine.name == "LOCAL":
            # For local repositories, make sure to update them if they have an upstream
            if source_repo.upstream:
                source_repo.pull()

        # Get the target image hash from the source repo: otherwise, if the tag is, say, 'latest' and
        # the output has just had the base commit (000...) created in it, that commit will be the latest.
        clone(source_repo, local_repository=output, download_all=False)
        source_hash = source_repo.images[tag_or_hash].image_hash
        output.images.by_hash(source_hash).checkout()
        provenance = {
            "type": "FROM",
            "source_namespace": source_repo.namespace,
            "source": source_repo.repository,
            "source_hash": source_hash,
        }
    else:
        # FROM EMPTY AS repository -- initializes an empty repository (say to create a table or
        # import the results of a previous stage in a multistage build).
        # In this case, if AS repository has been specified, it's already been initialized. If not,
        # this command literally does nothing.
        if not output_node:
            raise SplitfileError(
                "FROM EMPTY without AS (repository) does nothing!")
    return output, provenance
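
For reference, a minimal Splitfile that would drive both branches of `_execute_from`, executed the same way as the Splitfile in Example #19 (the repository names, and the import path for `execute_commands`, are illustrative assumptions):

from splitgraph.splitfile import execute_commands

SPLITFILE = """
FROM someuser/somerepo:latest AS my/output
FROM EMPTY AS my/scratch
"""

# The first FROM clears/initializes my/output and checks out the source image;
# FROM EMPTY ... AS just initializes an empty my/scratch repository.
execute_commands(SPLITFILE, output=Repository("someuser", "scratch"))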
Example #12
def test_lq_qual_filtering(local_engine_empty, unprivileged_pg_repo,
                           clean_minio, test_case):
    # Test that LQ prunes the object list based on quals
    # We can't really see that directly, so we check to see which objects it tries to download.
    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)

    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)
    pg_repo_local.images["latest"].checkout(layered=True)
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0

    query, expected, object_mask = test_case
    required_objects = pg_repo_local.head.get_table("fruits").objects

    assert len(required_objects) == 5
    assert required_objects == [
        # Initial fragment
        "of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5",
        # INS (3, mayonnaise)
        "of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b",
        # DEL (1, apple)
        "o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d",
        # UPS (2, guitar), replaces (2, orange)
        "o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc",
        # INS (4, kumquat)
        "oc27ee277aff108525a2df043d9efdaa1c3e26a4949a6cf6b53ee0c889c8559",
    ]

    expected_objects = [o for o, m in zip(required_objects, object_mask) if m]

    assert pg_repo_local.run_sql(query) == expected
    used_objects = pg_repo_local.objects.get_downloaded_objects()
    assert set(expected_objects) == set(used_objects)
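
`test_case` here is a parametrized `(query, expected, object_mask)` tuple. One plausible instance, reasoned from the five fragments listed above (an assumption, not one of the actual parametrizations): a qual on `fruit_id = 4` overlaps only the INS (4, kumquat) fragment, so LQ should download just that object.

example_query = "SELECT * FROM fruits WHERE fruit_id = 4"
# Mask over required_objects above: only the last fragment is needed.
example_object_mask = [False, False, False, False, True]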
Example #13
def test_push_own_delete_own_different_namespaces(local_engine_empty,
                                                  readonly_pg_repo):
    # Same as previous but we clone the read-only repo and push to our own namespace
    # to check that the objects we push get their namespaces rewritten to be the unprivileged user, not test.
    destination = clone(readonly_pg_repo)

    destination.images["latest"].checkout()
    destination.run_sql(
        """UPDATE fruits SET name = 'banana' WHERE fruit_id = 1""")
    destination.commit()

    remote_destination = Repository.from_template(
        readonly_pg_repo,
        namespace=readonly_pg_repo.engine.conn_params["SG_NAMESPACE"],
        engine=readonly_pg_repo.engine,
    )
    destination.upstream = remote_destination

    destination.push(handler="S3")

    object_id = destination.head.get_table("fruits").objects[-1]
    object_meta = remote_destination.objects.get_object_meta([object_id])
    assert (object_meta[object_id].namespace ==
            readonly_pg_repo.engine.conn_params["SG_NAMESPACE"])

    # Test we can delete our own repo once we've pushed it
    remote_destination.delete(uncheckout=False)
    assert len(remote_destination.images()) == 0
Example #14
def test_lq_remote(local_engine_empty, pg_repo_remote):
    # Test layered querying works when we initialize it on a cloned repo that doesn't have any
    # cached objects (all are on the remote).

    # 1 patch on top of fruits, 1 patch on top of vegetables
    prepare_lq_repo(pg_repo_remote, commit_after_every=False, include_pk=True)
    pg_repo_local = clone(pg_repo_remote, download_all=False)
    _test_lazy_lq_checkout(pg_repo_local)
Example #15
def test_push_others(local_engine_empty, readonly_pg_repo):
    destination = clone(readonly_pg_repo)
    destination.images["latest"].checkout()
    destination.run_sql(
        """UPDATE fruits SET name = 'banana' WHERE fruit_id = 1""")
    destination.commit()

    with pytest.raises(ProgrammingError) as e:
        destination.push(remote_repository=readonly_pg_repo, handler="S3")
    assert "You do not have access to this namespace!" in str(e.value)
Example #16
def _prepare_fully_remote_repo(local_engine_empty, pg_repo_remote_registry):
    # Setup: same as external, with an extra patch on top of the fruits table.
    pg_repo_local = clone(pg_repo_remote_registry)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'kumquat')")
    pg_repo_local.commit()
    pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_local.objects.cleanup()
    pg_repo_local.commit_engines()
Example #17
def test_pulls_with_lazy_object_downloads(local_engine_empty, pg_repo_remote):
    clone(pg_repo_remote, local_repository=PG_MNT, download_all=False)
    # Make sure we haven't downloaded anything until checkout
    assert not PG_MNT.objects.get_downloaded_objects()

    remote_head = pg_repo_remote.head

    PG_MNT.images.by_hash(remote_head.image_hash).checkout()
    assert (len(PG_MNT.objects.get_downloaded_objects()) == 2
            )  # Original fruits and vegetables tables.
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(
        PG_MNT.objects.get_all_objects())

    # In the meantime, make two branches off of origin (a total of 3 commits)
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    left = pg_repo_remote.commit()

    remote_head.checkout()
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mustard')")
    right = pg_repo_remote.commit()

    # Pull from upstream.
    PG_MNT.pull(download_all=False)
    # Make sure we have the pointers to the three versions of the fruits table + the original vegetables
    assert len(PG_MNT.objects.get_all_objects()) == 4

    # Also make sure we still only have the objects with the original fruits + vegetables tables
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2

    # Check out left commit: since it only depends on the root, we should download just the new version of fruits.
    PG_MNT.images.by_hash(left.image_hash).checkout()

    assert (len(PG_MNT.objects.get_downloaded_objects()) == 3
            )  # now have 2 versions of fruits + 1 vegetables

    PG_MNT.images.by_hash(right.image_hash).checkout()
    assert (len(PG_MNT.objects.get_downloaded_objects()) == 4
            )  # now have 3 versions of fruits + 1 vegetables
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(
        PG_MNT.objects.get_all_objects())
Example #18
def test_lq_external(local_engine_empty, unprivileged_pg_repo,
                     pg_repo_remote_registry, clean_minio):
    # Test layered querying works when we initialize it on a cloned repo that doesn't have any
    # cached objects (all are on S3 or other external location).

    pg_repo_local = clone(unprivileged_pg_repo)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)

    # Setup: upstream has the same repository as in the previous test but with no cached objects (all are external).
    # In addition, we check that LQ works against an unprivileged upstream (where we don't actually have
    # admin access).
    pg_repo_local.push(unprivileged_pg_repo, handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_local.objects.cleanup()

    assert len(pg_repo_local.objects.get_all_objects()) == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 6

    # Proceed as per the previous test
    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)
    _test_lazy_lq_checkout(pg_repo_local)
Example #19
def test_import_updating_splitfile_with_uploading(local_engine_empty,
                                                  remote_engine,
                                                  pg_repo_remote):
    execute_commands(load_splitfile("import_and_update.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head

    assert len(OUTPUT.objects.get_all_objects()
               ) == 4  # Two original tables + two updates

    # Push with upload. Have to specify the remote repo.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository,
                               remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})
    # Unmount everything locally and cleanup
    OUTPUT.delete()

    # OUTPUT doesn't exist but we use its ObjectManager reference to access the global object
    # manager for the engine (maybe should inject it into local_engine/remote_engine instead)
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)

    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    assert len(existing_objects) == 4  # Two original tables + two updates
    # Only 2 objects are stored externally (the other two have been on the remote the whole time)
    assert len(
        OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
Example #20
def test_lq_single_non_snap_object(local_engine_empty, unprivileged_pg_repo, clean_minio):
    # The object produced by
    # "DELETE FROM vegetables WHERE vegetable_id = 1;INSERT INTO vegetables VALUES (3, 'celery')"
    # has a deletion and an insertion. Check that an LQ that only uses that object
    # doesn't return the extra upserted/deleted flag column.

    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)

    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)
    pg_repo_local.images["latest"].checkout(layered=True)

    assert pg_repo_local.run_sql(
        "SELECT * FROM vegetables WHERE vegetable_id = 3 AND name = 'celery'"
    ) == [(3, "celery")]
    used_objects = pg_repo_local.objects.get_downloaded_objects()
    assert len(used_objects) == 1
Example #21
    def init_repo(self, repo_info: RepoInfo) -> Repository:
        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)

        if not repository_exists(repo):
            self.logger.info("Creating repo {}/{}...".format(
                repo.namespace, repo.repository))
            repo.init()

        if repo_info.remote_name:
            remote = Repository.from_template(repo,
                                              engine=get_engine(
                                                  repo_info.remote_name))
            cloned_repo = clone(
                remote,
                local_repository=repo,
                download_all=False,
                overwrite_objects=True,
                overwrite_tags=True,
            )

        return repo
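
`RepoInfo` isn't defined in this section; judging purely by the attribute accesses in `init_repo` above and `read` below, it plausibly looks like this reconstruction (not the actual definition):

from dataclasses import dataclass
from typing import Optional


@dataclass
class RepoInfo:
    # Fields inferred from usage: namespace/repository name the repo,
    # remote_name picks the engine, tag and table drive read().
    namespace: str
    repository: str
    remote_name: Optional[str] = None
    tag: Optional[str] = None
    table: Optional[str] = None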
Example #22
    def read(self, location: str) -> Result:
        new = self.copy()
        new.location = location
        try:
            repo = Repository(namespace=new.repo_info.namespace,
                              repository=new.repo_info.repository)
            remote = Repository.from_template(repo,
                                              engine=get_engine(
                                                  new.repo_info.remote_name,
                                                  autocommit=True))

            cloned_repo = clone(
                remote,
                local_repository=repo,
                download_all=True,
                overwrite_objects=True,
                overwrite_tags=True,
                single_image=new.repo_info.tag,
            )
            data = sql_to_df(f"SELECT * FROM {new.repo_info.table}",
                             repository=cloned_repo,
                             use_lq=self.layer_query)

            if self.schema is not None:
                errors = self.schema.validate(data)
                if errors:
                    raise SchemaValidationError(errors)

            new.value = data
        except Exception as exc:
            self.logger.exception(
                "Unexpected error while reading from result handler: {}".format(
                    repr(exc)))
            raise exc

        return new
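
A hedged usage sketch for `read`, assuming the enclosing class is a Prefect-style result handler; the class name `SplitgraphResult` and its constructor arguments are hypothetical:

result = SplitgraphResult(
    repo_info=RepoInfo(namespace="someuser", repository="somerepo",
                       remote_name="origin", tag="latest", table="fruits"),
    layer_query=True,
)
# read() clones the tagged image and loads the table into a DataFrame.
df = result.read("some/location").value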
Example #23
def test_bloom_reindex_push(local_engine_empty, unprivileged_pg_repo,
                            clean_minio):
    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)
    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)

    # Do a reindex on the local engine and push the dataset back out.
    fruits = pg_repo_local.images["latest"].get_table("fruits")

    # The repo used for LQ tests has 2 objects that overwrite data, so we ignore those.
    reindexed = fruits.reindex(
        extra_indexes={"bloom": {
            "name": {
                "probability": 0.01
            }
        }},
        raise_on_patch_objects=False)
    pg_repo_local.commit_engines()

    # Push back out overwriting object metadata
    pg_repo_local.push(overwrite_objects=True, single_image="latest")

    # Check the index was written to the registry.
    assert ("bloom" in unprivileged_pg_repo.objects.get_object_meta(reindexed)[
        reindexed[0]].object_index)
Example #24
def test_push_upload_error(local_engine_empty, unprivileged_pg_repo,
                           pg_repo_remote_registry, clean_minio, interrupted):
    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    PG_MNT.images["latest"].checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    PG_MNT.run_sql("INSERT INTO vegetables VALUES (3, 'cucumber')")
    head = PG_MNT.commit()

    # If the upload fails for whatever reason (e.g. Minio is inaccessible or the upload was aborted),
    # the whole push fails rather than leaving the registry in an inconsistent state.
    with patch.dict(
            "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS",
            {"S3": _flaky_handler(incomplete=interrupted)},
    ):
        with pytest.raises(Exception) as e:
            PG_MNT.push(remote_repository=unprivileged_pg_repo,
                        handler="S3",
                        handler_options={})

    assert head not in unprivileged_pg_repo.images
    # Only the two original tables from the original image upstream
    assert (pg_repo_remote_registry.engine.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.tables",
        return_shape=ResultShape.ONE_ONE) == 2)

    # Registry had 2 objects before the upload -- if we interrupted the upload,
    # we only managed to upload the first object that was registered (even if the image
    # wasn't).

    expected_object_count = 3 if interrupted else 2

    assert len(pg_repo_remote_registry.objects.get_all_objects()
               ) == expected_object_count

    # Two new objects not registered remotely since the upload failed
    assert (local_engine_empty.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
        return_shape=ResultShape.ONE_ONE,
    ) == expected_object_count)
    assert (pg_repo_remote_registry.engine.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
        return_shape=ResultShape.ONE_ONE,
    ) == expected_object_count)

    # Now do the push normally and check the image exists upstream.
    PG_MNT.push(remote_repository=unprivileged_pg_repo,
                handler="S3",
                handler_options={})

    assert any(i.image_hash == head.image_hash
               for i in unprivileged_pg_repo.images)

    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 4

    assert (local_engine_empty.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
        return_shape=ResultShape.ONE_ONE,
    ) == 4)
    assert (pg_repo_remote_registry.engine.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
        return_shape=ResultShape.ONE_ONE,
    ) == 4)
Example #25
def test_pull_tag_overwriting(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head
    head_1 = _add_image_to_repo(pg_repo_remote)

    head.tag("tag_1")
    head_1.tag("tag_2")
    head_1.tag("tag_3")
    pg_repo_remote.commit_engines()

    # Clone a single image
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
    )
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images()[0] == head
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Clone again, check nothing has changed.
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
    )
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Pull the remainder of the repo
    PG_MNT.pull(single_image=head_1.image_hash)
    assert len(PG_MNT.images()) == 2
    assert PG_MNT.images["tag_2"] == head_1

    # Now update the tag on the remote
    head.tag("tag_2")
    pg_repo_remote.commit_engines()

    # Clone head again, check tag_2 wasn't overwritten (is still pointing to head_1)
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
    )
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head_1
    assert PG_MNT.images["tag_3"] == head_1

    # Clone head again, this time overwriting the tag
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
        overwrite_tags=True,
    )
    assert len(PG_MNT.images()) == 2
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head
    assert PG_MNT.images["tag_3"] == head_1

    # Update tag_3 to point to head as well
    head.tag("tag_3")
    pg_repo_remote.commit_engines()

    # Pull repo, check tag_3 hasn't moved.
    PG_MNT.pull()
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head
    assert PG_MNT.images["tag_3"] == head_1

    # Pull again overwriting all tags, check tags have moved.
    PG_MNT.pull(overwrite_tags=True)
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head
    assert PG_MNT.images["tag_3"] == head
def test_s3_push_pull(local_engine_empty, unprivileged_pg_repo,
                      pg_repo_remote_registry, clean_minio):
    # Test pushing/pulling when the objects are uploaded to a remote storage instead of to the actual remote DB.

    # In the beginning, the registry has two objects, all remote
    objects = pg_repo_remote_registry.objects.get_all_objects()
    assert len(
        unprivileged_pg_repo.objects.get_external_object_locations(
            list(objects))) == 2
    assert len(objects) == 2

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    # Add a couple of commits, this time on the cloned copy.
    head = PG_MNT.images["latest"]
    head.checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    left = PG_MNT.commit()
    head.checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mustard')")
    right = PG_MNT.commit()

    # Push to origin, but this time upload the actual objects instead.
    PG_MNT.push(remote_repository=unprivileged_pg_repo,
                handler="S3",
                handler_options={})

    # Check that the actual objects don't exist on the remote but are instead registered with a URL.
    # All the objects on pgcache were registered remotely.
    objects = pg_repo_remote_registry.objects.get_all_objects()
    local_objects = PG_MNT.objects.get_all_objects()
    assert all(o in objects for o in local_objects)
    # Two new non-local objects in the local engine, both registered as non-local on the remote engine.
    ext_objects_orig = PG_MNT.objects.get_external_object_locations(
        list(objects))
    ext_objects_pull = unprivileged_pg_repo.objects.get_external_object_locations(
        list(objects))
    assert len(ext_objects_orig) == 4
    assert all(e in ext_objects_pull for e in ext_objects_orig)

    # Destroy the pulled mountpoint and recreate it again.
    assert len(PG_MNT.objects.get_downloaded_objects()) == 4
    PG_MNT.delete()
    # Make sure we don't have any leftover physical objects.
    PG_MNT.objects.cleanup()
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)

    # Proceed as per the lazy checkout tests to make sure we don't download more than required.
    # Make sure we still haven't downloaded anything.
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0

    # Check out left commit: since it only depends on the root, we should download just the new version of fruits.
    left.checkout()
    assert (len(PG_MNT.objects.get_downloaded_objects()) == 3
            )  # now have 2 versions of fruits + 1 vegetables

    right.checkout()
    assert len(PG_MNT.objects.get_downloaded_objects()) == 4
    # Only now we actually have all the objects materialized.
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(
        PG_MNT.objects.get_all_objects())
Example #27
def test_pull_public(local_engine_empty, readonly_pg_repo):
    clone(readonly_pg_repo)
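
A hedged continuation showing how the cloned public repository could then be used, mirroring the checkout pattern in Example #15 (the original test may well stop at the clone call):

destination = clone(readonly_pg_repo)
destination.images["latest"].checkout()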