def test_push_own_delete_own(local_engine_empty, unprivileged_pg_repo):
    """Push a locally modified clone into the user's own namespace, then delete
    one image and finally the whole repository from the remote."""
    local_copy = Repository.from_template(unprivileged_pg_repo, engine=local_engine_empty)
    clone(unprivileged_pg_repo, local_repository=local_copy)

    # Make a local change on top of the latest image so there is something new to push.
    checked_out = local_copy.images["latest"]
    checked_out.checkout()
    local_copy.run_sql("""UPDATE fruits SET name = 'banana' WHERE fruit_id = 1""")
    local_copy.commit()

    # Test we can push to our namespace -- can't upload the object to the splitgraph_meta
    # since we can't create tables there
    remote_engine = unprivileged_pg_repo.engine
    own_namespace = remote_engine.conn_params["SG_NAMESPACE"]
    remote_copy = Repository.from_template(
        local_copy, namespace=own_namespace, engine=remote_engine
    )
    local_copy.upstream = remote_copy
    local_copy.push(handler="S3")

    # Test we can delete a single image from our own repo
    assert len(remote_copy.images()) == 3
    newest_hash = local_copy.images["latest"].image_hash
    remote_copy.images.delete([newest_hash])
    assert len(remote_copy.images()) == 2

    # Test we can delete our own repo once we've pushed it
    remote_copy.delete()
    assert len(remote_copy.images()) == 0
def readonly_pg_repo(unprivileged_remote_engine, pg_repo_remote_registry):
    """Yield a copy of the registry repository living under READONLY_NAMESPACE,
    bound to the unprivileged remote engine."""
    readonly_copy = Repository.from_template(
        pg_repo_remote_registry, namespace=READONLY_NAMESPACE
    )
    clone(pg_repo_remote_registry, readonly_copy)

    # Drop the original repository and hand its objects over to the read-only namespace.
    pg_repo_remote_registry.delete(uncheckout=False)
    registry_engine = pg_repo_remote_registry.engine
    namespace_swap = (READONLY_NAMESPACE, REMOTE_NAMESPACE)
    registry_engine.run_sql(
        "UPDATE splitgraph_meta.objects SET namespace=%s WHERE namespace=%s",
        namespace_swap,
    )
    registry_engine.commit()

    yield Repository.from_template(readonly_copy, engine=unprivileged_remote_engine)
def unprivileged_pg_repo(unprivileged_remote_engine, pg_repo_remote_registry):
    """Yield the same repository as pg_repo_remote_registry, but bound to the
    unprivileged remote engine.

    Per the original fixture notes, that user cannot access splitgraph_meta directly
    and has to go through splitgraph_api. A test that needs splitgraph_meta access can
    request both fixtures and use e.g.
    pg_repo_remote_registry.objects.get_all_objects().
    """
    unprivileged_view = Repository.from_template(
        pg_repo_remote_registry, engine=unprivileged_remote_engine
    )
    yield unprivileged_view
def test_push_own_delete_own_different_namespaces(local_engine_empty, readonly_pg_repo):
    """Clone the read-only repository, push it to the user's own namespace and check
    the pushed object is recorded under that namespace."""
    # Same as previous but we clone the read-only repo and push to our own namespace
    # to check that the objects we push get their namespaces rewritten to be the
    # unprivileged user, not test.
    local_copy = clone(readonly_pg_repo)
    local_copy.images["latest"].checkout()
    local_copy.run_sql("""UPDATE fruits SET name = 'banana' WHERE fruit_id = 1""")
    local_copy.commit()

    remote_copy = Repository.from_template(
        readonly_pg_repo,
        namespace=readonly_pg_repo.engine.conn_params["SG_NAMESPACE"],
        engine=readonly_pg_repo.engine,
    )
    local_copy.upstream = remote_copy
    local_copy.push(handler="S3")

    # The last object of the table is the one the local commit has just produced.
    newest_object = local_copy.head.get_table("fruits").objects[-1]
    remote_meta = remote_copy.objects.get_object_meta([newest_object])
    expected_namespace = readonly_pg_repo.engine.conn_params["SG_NAMESPACE"]
    assert remote_meta[newest_object].namespace == expected_namespace

    # Test we can delete our own repo once we've pushed it
    remote_copy.delete(uncheckout=False)
    assert len(remote_copy.images()) == 0
def test_bloom_reindex_remote(local_engine_empty, unprivileged_pg_repo, clean_minio):
    """Reindex a table with a bloom index and check the index ends up in the
    remote object metadata."""
    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)

    # Do a reindex using our local engine to query the object and the remote engine
    # to write metadata to.
    split_repo = Repository.from_template(
        unprivileged_pg_repo, object_engine=local_engine_empty
    )
    fruits_table = split_repo.images["latest"].get_table("fruits")

    # The repo used for LQ tests has 2 objects that overwrite data, so we ignore those.
    bloom_spec = {"bloom": {"name": {"probability": 0.01}}}
    new_objects = fruits_table.reindex(
        extra_indexes=bloom_spec, raise_on_patch_objects=False
    )
    split_repo.commit_engines()

    assert len(new_objects) == 3
    assert set(split_repo.objects.get_downloaded_objects()) == set(new_objects)

    # Check the index was written to the remote metadata engine.
    remote_meta = unprivileged_pg_repo.objects.get_object_meta(new_objects)
    assert "bloom" in remote_meta[new_objects[0]].object_index
def test_push_single_image(pg_repo_local, remote_engine):
    """Push one image on its own (twice), check it out remotely, then push the rest."""
    first_head = pg_repo_local.head
    _add_image_to_repo(pg_repo_local)
    target = Repository.from_template(pg_repo_local, engine=remote_engine)

    assert len(target.images()) == 0
    assert len(target.objects.get_all_objects()) == 0

    # The second iteration pushes the same image again and must not change anything.
    for _attempt in range(2):
        pg_repo_local.push(remote_repository=target, single_image=first_head.image_hash)
        assert len(target.images()) == 1
        assert len(target.objects.get_all_objects()) == 2

    # Test we can check the repo out on the remote.
    target.images[first_head.image_hash].checkout()

    # Push the rest
    pg_repo_local.push(target)
    assert len(target.images()) == 3
    assert len(target.objects.get_all_objects()) == 3
def make_pg_repo(engine, repository=None):
    """Initialise a repository on ``engine``, load PG_DATA into it and commit.

    :param engine: Engine the repository is bound to.
    :param repository: Optional template repository; ``test/pg_mount`` is used when
        it is falsy.
    :return: The committed repository bound to ``engine``.
    """
    template = repository if repository else Repository("test", "pg_mount")
    bound_repo = Repository.from_template(template, engine=engine)
    bound_repo.init()
    bound_repo.run_sql(PG_DATA)
    bound_repo.commit()
    return bound_repo
def clone_c(remote_repository_or_image, local_repository, remote, download_all, overwrite_object_meta, tags):
    """
    Clone a remote Splitgraph repository/image into a local one.

    The lookup path for the repository is governed by the ``SG_REPO_LOOKUP`` and ``SG_REPO_LOOKUP_OVERRIDE``
    config parameters and can be overridden by the command line ``--remote`` option.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine
    from splitgraph.core.repository import clone

    source, single_image = remote_repository_or_image

    # An explicit remote can be injected straight into the repository spec. Without one,
    # clone() receives the schema string and looks up the engine the repository lives on.
    source = (
        Repository.from_template(source, engine=get_engine(remote))
        if remote
        else source.to_schema()
    )

    clone(
        source,
        local_repository=local_repository,
        download_all=download_all,
        single_image=single_image,
        overwrite_objects=overwrite_object_meta,
        overwrite_tags=tags,
    )
def exists(self, location: str, **kwargs: Any) -> bool:
    """
    Checks whether the target result exists on the Splitgraph remote. Does not
    validate whether the result is `valid`, only that it is present.

    Args:
        - location (str): Location of the result in the specific result target.
            Will check whether the provided location exists
        - **kwargs (Any): string format arguments for `location`

    Returns:
        - bool: whether or not the target result exists

    Raises:
        - Exception: any error raised while resolving the location or querying the
            remote is logged and re-raised unchanged
    """
    try:
        # Apply the format arguments before parsing so that templated locations
        # resolve to the actual repository/table being checked.
        repo_info = parse_repo(location.format(**kwargs))
        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)
        remote = Repository.from_template(
            repo, engine=get_engine(repo_info.remote_name, autocommit=True))
        # Return the table check itself: the previous version discarded this result
        # and returned an S3 `get_object` response, contradicting the `bool` contract.
        return bool(table_exists_at(remote, repo_info.table))
    except Exception as exc:
        self.logger.exception(
            "Unexpected error while reading from Splitgraph: {}".format(
                repr(exc)))
        raise
def dependents_c(image_spec, source_on, dependents_on):
    """
    List images that were created from an image.

    This is the inverse of the sgr provenance command. It will list all images that were
    created using a Splitfile that imported data from this image.

    By default, this will look at images on the local engine. The engine can be overridden
    with --source-on and --dependents-on. For example:

        sgr dependents --source-on data.splitgraph.com --dependents-on LOCAL noaa/climate:latest

    will show all images on the local engine that derived data from `noaa/climate:latest`
    on the Splitgraph registry.
    """
    from splitgraph.engine import get_engine
    from splitgraph.core.repository import Repository

    # Engine holding the image whose dependents are being looked up.
    if source_on:
        source_engine = get_engine(source_on)
    else:
        source_engine = get_engine()

    template, image_ref = image_spec
    source_repo = Repository.from_template(template, engine=source_engine)
    source_image = source_repo.images[image_ref]

    # Engine that is searched for images derived from the source image.
    if dependents_on:
        target_engine = get_engine(dependents_on)
    else:
        target_engine = get_engine()

    dependents = source_image.provenance(reverse=True, engine=target_engine)
    click.echo("%s:%s is depended on by:" % (str(source_repo), source_image.image_hash))
    click.echo("\n".join("%s:%s" % rs for rs in dependents))
def test_engine_autocommit(local_engine_empty):
    """A repository initialised through an autocommit engine must still be visible
    from another engine after a rollback."""
    autocommit_engine = PostgresEngine(
        conn_params=_prepare_engine_config(CONFIG),
        name="test_engine",
        autocommit=True,
    )

    repo = Repository("test", "repo", engine=autocommit_engine)
    repo.init()
    # The rollback is expected to be a no-op since init() ran in autocommit mode.
    repo.engine.rollback()

    seen_from_local = Repository.from_template(repo, engine=local_engine_empty)
    assert repository_exists(seen_from_local)
def test_pull_push(local_engine_empty, pg_repo_remote):
    """Exercise the clone, pull and push CLI commands end to end between the local
    engine and the remote repository, checking image and downloaded-object counts
    after each step."""
    runner = CliRunner()
    pg_repo_local = Repository.from_template(pg_repo_remote, engine=local_engine_empty)

    # Clone the base 0000.. image first to check single-image clones
    assert len(pg_repo_local.images()) == 0
    result = runner.invoke(clone_c, [str(pg_repo_local) + ":" + "00000000"])
    assert result.exit_code == 0
    assert len(pg_repo_local.images()) == 1
    assert repository_exists(pg_repo_local)

    # Clone the rest of the repo
    result = runner.invoke(clone_c, [str(pg_repo_local)])
    assert result.exit_code == 0
    assert len(pg_repo_local.images()) == 2

    # Create a new image on the remote so there is something to pull.
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    remote_engine_head = pg_repo_remote.commit()

    # Pull the new image
    result = runner.invoke(
        pull_c, [str(pg_repo_local) + ":" + remote_engine_head.image_hash[:10]])
    assert result.exit_code == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_local.images()) == 3

    # Pull the whole repo (should be no changes)
    result = runner.invoke(pull_c, [str(pg_repo_local)])
    assert result.exit_code == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_local.images()) == 3

    # Pull repo downloading everything
    result = runner.invoke(pull_c, [str(pg_repo_local), "--download-all"])
    assert result.exit_code == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 3

    # Create a local-only image on top of the pulled one.
    pg_repo_local.images.by_hash(remote_engine_head.image_hash).checkout()
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'mustard')")
    local_head = pg_repo_local.commit()
    assert local_head.image_hash not in list(pg_repo_remote.images)

    # Push out the single new image first
    result = runner.invoke(
        push_c, [str(pg_repo_local) + ":" + local_head.image_hash[:10], "-h", "DB"])
    assert result.exit_code == 0
    assert len(pg_repo_remote.images()) == 4

    # Push out the whole repo
    result = runner.invoke(push_c, [str(pg_repo_local), "-h", "DB"])
    assert result.exit_code == 0
    # NOTE(review): this assert is split across two physical lines in the source and
    # may be truncated; as written it only checks that the table lookup is truthy.
    assert pg_repo_local.head.get_table("fruits")
def test_multiengine_flow(
    local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio
):
    """Query a repository whose metadata is read through the remote engine while
    objects are downloaded to and queried on the local engine, then check where the
    cache bookkeeping ended up."""
    # Test querying by using the remote engine as a metadata store and the local engine as an object store.
    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)
    pg_repo_local = Repository.from_template(unprivileged_pg_repo, object_engine=local_engine_empty)

    # Checkout currently requires the engine connection to be privileged
    # (since it does manage_audit_triggers()) -- so we bypass all bookkeeping and call the
    # actual LQ routine directly.
    local_engine_empty.create_schema(pg_repo_local.to_schema())
    pg_repo_local.images["latest"]._lq_checkout()

    # Take one of the test cases we ran in test_lq_qual_filtering that exercises index lookups,
    # LQs, object downloads and make sure that the correct engines are used
    result = pg_repo_local.run_sql("SELECT * FROM fruits WHERE fruit_id >= 3 ORDER BY fruit_id")
    assert result == [(3, "mayonnaise", 1, _DT), (4, "kumquat", 1, _DT)]

    # Test cache occupancy calculations work only using the object engine
    _assert_cache_occupancy(pg_repo_local.objects, 2)

    # 2 objects downloaded from S3 to satisfy the query -- on the local engine
    assert (
        local_engine_empty.run_sql(
            "SELECT COUNT(1) FROM splitgraph_meta.object_cache_status",
            return_shape=ResultShape.ONE_ONE,
        )
        == 2
    )
    # Exactly two tables in the local splitgraph_meta schema are not metadata tables
    # (presumably the two downloaded objects -- TODO confirm).
    assert (
        len(set(local_engine_empty.get_all_tables("splitgraph_meta")).difference(set(META_TABLES)))
        == 2
    )

    # Test the local engine doesn't actually have any metadata stored on it.
    for table in META_TABLES:
        # Cache bookkeeping and version tables are excluded from the emptiness check.
        if table not in ("object_cache_status", "object_cache_occupancy", "version"):
            assert (
                local_engine_empty.run_sql(
                    "SELECT COUNT(1) FROM splitgraph_meta." + table,
                    return_shape=ResultShape.ONE_ONE,
                )
                == 0
            )

    # remote engine untouched
    # NOTE(review): the source lost its indentation; this check is assumed to run once,
    # after the loop -- confirm against the original file.
    assert (
        pg_repo_remote_registry.engine.run_sql(
            "SELECT COUNT(1) FROM splitgraph_meta.object_cache_status",
            return_shape=ResultShape.ONE_ONE,
        )
        == 0
    )
def _make_push_target(repository, remote):
    """Build the remote Repository to push ``repository`` to on ``remote``.

    The target namespace comes from the remote's ``SG_NAMESPACE`` config entry; when
    that entry is missing, ``None`` is passed as the namespace to
    ``Repository.from_template``.

    :param repository: Local Repository used as the template.
    :param remote: Name of the remote engine.
    :return: Repository bound to the remote engine.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine

    target_namespace = None
    try:
        target_namespace = get_from_subsection(CONFIG, "remotes", remote, "SG_NAMESPACE")
    except KeyError:
        # No namespace configured for this remote.
        pass

    return Repository.from_template(
        repository, namespace=target_namespace, engine=get_engine(remote)
    )
def run(self, workspaces: Dict[str, Workspace] = None, sgr_tags: Dict[str, List[str]] = None, **kwargs: Any):
    """
    Tag the new head image of each changed repository, then push every repository
    that has a remote configured.

    Args:
        - workspaces (Dict[str, Workspace], optional): mapping of a name to a workspace;
            each workspace is read for its 'repo_uri' and 'image_hash' entries.
            When omitted or empty, nothing is tagged or pushed.
        - sgr_tags (Dict[str, List[str]], optional): tags to apply to the head image of
            a repository, keyed by the same names as `workspaces`
        - **kwargs (Any): accepted for interface compatibility; unused here

    Returns:
        - Dict[str, str]: the 'repo_uri' of every workspace whose repository head
            differs from the workspace's recorded 'image_hash'
    """
    # The default is None, so calling run() without workspaces must be a no-op
    # instead of failing on `None.items()`.
    workspaces = workspaces or {}
    sgr_tags = sgr_tags or {}

    repo_infos = {
        name: parse_repo(workspace['repo_uri'])
        for name, workspace in workspaces.items()
    }
    repos = {
        name: Repository(namespace=repo_info.namespace,
                         repository=repo_info.repository)
        for name, repo_info in repo_infos.items()
    }
    # A repository has a "new image" when it has a head that is not the image the
    # workspace was created from.
    repos_with_new_images = {
        name: repo
        for name, repo in repos.items()
        if repo.head and repo.head.image_hash != workspaces[name]['image_hash']
    }

    for name, repo in repos_with_new_images.items():
        for tag in sgr_tags.get(name, []):
            repo.head.tag(tag)

    # Push all repos. We don't know for sure that it shouldn't be pushed
    for name, repo in repos.items():
        remote_name = repo_infos[name].remote_name
        if not remote_name:
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning(
                f'No remote_name specified. Not pushing {name}.')
            continue

        remote = Repository.from_template(repo, engine=get_engine(remote_name))
        repo.push(
            remote,
            handler="S3",
            handler_options={"threads": 8},
            overwrite_objects=True,
            overwrite_tags=True,
        )
        self.logger.info(f'Pushed {name} to {remote_name}')

    return {name: workspaces[name]['repo_uri'] for name in repos_with_new_images}
def _determine_push_target(repository, remote_repository, remote):
    """
    Work out which remote Repository a push should go to.

    Resolution order:

      * both ``remote_repository`` and ``remote`` given: that repository on that remote;
      * only ``remote`` given: the target built by ``_make_push_target`` for that remote;
      * only ``remote_repository`` given: that repository on the default remote;
      * neither given: the repository's upstream if it has one, otherwise the target
        built by ``_make_push_target`` for the default remote.

    With data.splitgraph.com as the only registered engine this streamlines the
    out-of-the-box experience:

      * sgr push repo: will push to myself/repo on data.splitgraph.com with S3 uploading
        (user's namespace).
      * sgr push noaa/climate: will push to myself/climate
      * sgr push noaa/climate noaa/climate: will explicitly push to noaa/climate (assuming
        the user can write to that repository).

    If the user registers another registry at splitgraph.mycompany.com, then they can do:

      * sgr push noaa/climate -r splitgraph.mycompany.com: will push to noaa/climate

    :param repository: Local Repository, required.
    :param remote_repository: remote Repository (without the remote engine), optional.
    :param remote: Name of the remote engine/registry, optional.
    :return: Remote Repository to push to.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine

    if remote_repository and remote:
        return Repository.from_template(remote_repository, engine=get_engine(remote))

    if remote:
        return _make_push_target(repository, remote)

    if remote_repository:
        default_engine = get_engine(_get_default_remote())
        return Repository.from_template(remote_repository, engine=default_engine)

    return repository.upstream or _make_push_target(repository, _get_default_remote())
def init_repo(self, repo_info: RepoInfo) -> Repository:
    """
    Make sure the repository described by `repo_info` exists locally and, when a
    remote is configured, clone the remote repository's metadata into it.

    Args:
        - repo_info (RepoInfo): read for `namespace`, `repository` and `remote_name`

    Returns:
        - Repository: the local repository
    """
    repo = Repository(namespace=repo_info.namespace,
                      repository=repo_info.repository)

    if not repository_exists(repo):
        self.logger.info("Creating repo {}/{}...".format(
            repo.namespace, repo.repository))
        repo.init()

    # NOTE(review): the source lost its indentation; the clone step is assumed to run
    # whether or not the repository already existed -- confirm against the original.
    if repo_info.remote_name:
        remote = Repository.from_template(repo, engine=get_engine(
            repo_info.remote_name))
        # download_all=False: objects are not all fetched up front. The return value
        # is not used afterwards.
        cloned_repo = clone(
            remote,
            local_repository=repo,
            download_all=False,
            overwrite_objects=True,
            overwrite_tags=True,
        )

    return repo
def prune_c(repository, yes):
    """
    Cleanup dangling images from a repository.

    This includes images not pointed to by any tags (or checked out) and those that aren't required by any of
    such images.

    Will ask for confirmation of the deletion, unless ``-y ``is passed. If ``-r`` (``--remote``) is
    passed, this will perform deletion on a remote Splitgraph engine (registered in the config) instead, assuming
    the user has write access to the remote repository.

    This does not delete any physical objects that the deleted repository/images depend on:
    use ``sgr cleanup`` to do that.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine

    repository = Repository.from_template(repository, engine=get_engine())

    every_hash = {img.image_hash for img in repository.images()}
    tagged_hashes = {i for i, t in repository.get_all_hashes_tags()}
    # Anything that is neither tagged nor an ancestor of a tagged image is dangling.
    reachable = repository.images.get_all_parent_images(tagged_hashes)
    dangling_images = every_hash.difference(reachable)

    if not dangling_images:
        click.echo("Nothing to do.")
        return

    click.echo("Images to be deleted:")
    click.echo("\n".join(sorted(dangling_images)))
    click.echo("Total: %d" % len(dangling_images))

    if not yes:
        click.confirm("Continue? ", abort=True)

    repository.images.delete(dangling_images)
    repository.commit_engines()
    click.echo("Success.")
def read(self, location: str) -> Result:
    """
    Clone the image referenced by this result from its remote and load the result
    table into a DataFrame.

    Args:
        - location (str): location assigned to the returned copy

    Returns:
        - Result: a copy of this result whose `value` holds the table contents

    Raises:
        - SchemaValidationError: when a schema is set and the data fails validation
        - Exception: any other failure is logged and re-raised
    """
    new = self.copy()
    new.location = location

    try:
        local_repo = Repository(
            namespace=new.repo_info.namespace,
            repository=new.repo_info.repository,
        )
        remote_engine = get_engine(new.repo_info.remote_name, autocommit=True)
        remote_repo = Repository.from_template(local_repo, engine=remote_engine)

        # Fetch just the tagged image, objects included, replacing local metadata.
        cloned_repo = clone(
            remote_repo,
            local_repository=local_repo,
            download_all=True,
            overwrite_objects=True,
            overwrite_tags=True,
            single_image=new.repo_info.tag,
        )

        query = f"SELECT * FROM {new.repo_info.table}"
        data = sql_to_df(query, repository=cloned_repo, use_lq=self.layer_query)

        if self.schema is not None:
            errors = self.schema.validate(data)
            if errors:
                raise SchemaValidationError(errors)

        new.value = data
    except Exception as exc:
        self.logger.exception(
            "Unexpected error while reading from result handler: {}".format(repr(exc))
        )
        raise exc

    return new
def write(self, value_: Any, **kwargs: Any) -> Result:
    """
    Writes the result to a repository on Splitgraph

    Args:
        - value_ (Any): the value to write; will then be stored as the `value` attribute
            of the returned `Result` instance
        - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

    Returns:
        - Result: returns a new `Result` with both `value`, `comment`, `table`, and `tag` attributes

    Raises:
        - SchemaValidationError: when a schema is set and `value_` fails validation
        - AssertionError: when `value_` is not a pandas DataFrame
    """
    # Validate before touching any repository.
    if self.schema is not None:
        errors = self.schema.validate(value_)
        if errors:
            raise SchemaValidationError(errors)

    new = self.format(**kwargs)
    new.value = value_

    repo_info = parse_repo(new.location)

    repo = Repository(namespace=repo_info.namespace,
                      repository=repo_info.repository)
    # The remote counterpart is built up front even though it is only used when pushing.
    remote = Repository.from_template(repo, engine=get_engine(
        repo_info.remote_name, autocommit=True))

    assert isinstance(value_, pd.DataFrame)

    if not repository_exists(repo) and self.auto_init_repo:
        self.logger.info("Creating repo {}/{}...".format(
            repo.namespace, repo.repository))
        repo.init()

    # TODO: Retrieve the repo from bedrock first

    self.logger.info("Starting to upload result to {}...".format(
        new.location))

    # NOTE(review): the source lost its indentation; checkout, table write, commit and
    # tag are assumed to form the body of the atomic block, with the push after it --
    # confirm against the original file.
    with self.atomic(repo.engine):
        self.logger.info("checkout")
        img = repo.head
        img.checkout(force=True)

        self.logger.info("df to table")
        # if_exists='replace': the table is overwritten with the new data.
        df_to_table(new.value, repository=repo,
                    table=repo_info.table, if_exists='replace')

        self.logger.info("commit")
        new_img = repo.commit(comment=new.comment, chunk_size=10000)
        new_img.tag(repo_info.tag)

    # if (repo.diff(new.table, img, new_img)):
    if self.auto_push:
        self.logger.info("push")
        repo.push(
            remote,
            handler="S3",
            overwrite_objects=True,
            overwrite_tags=True,
            reupload_objects=True,
        )

    self.logger.info("Finished uploading result to {}...".format(
        new.location))

    return new
def upstream_c(repository, set_to, reset):
    """
    Get or set the upstream for a repository.

    This shows the default repository used for pushes and pulls as well as allows to change it to a different
    remote engine and repository.

    The remote engine alias must exist in the config file.

    Examples:

    ``sgr upstream my/repo --set splitgraph.com username/repo``

    Sets the upstream for ``my/repo`` to ``username/repo`` existing on the ``splitgraph.com`` engine

    ``sgr upstream my/repo --reset``

    Removes the upstream for ``my/repo``.

    ``sgr upstream my/repo``

    Shows the current upstream for ``my/repo``.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine

    # surely there's a better way of finding out whether --set isn't specified
    set_requested = set_to != ("", None)

    if set_requested and reset:
        raise click.BadParameter("Only one of --set and --reset can be specified!")

    # --reset: drop the upstream, or fail if there is none to drop.
    if reset:
        if not repository.upstream:
            click.echo("%s has no upstream to delete!" % repository.to_schema())
            sys.exit(1)
        else:
            del repository.upstream
            click.echo("Deleted upstream for %s." % repository.to_schema())
        return

    # No --set: just report the current upstream.
    if not set_requested:
        current = repository.upstream
        if current:
            click.echo(
                "%s is tracking %s:%s."
                % (repository.to_schema(), current.engine.name, current.to_schema())
            )
        else:
            click.echo("%s has no upstream." % repository.to_schema())
        return

    # --set: bind the given repository to the named engine and track it.
    engine, remote_repo = set_to
    try:
        remote_repo = Repository.from_template(remote_repo, engine=get_engine(engine))
    except KeyError:
        click.echo("Remote engine '%s' does not exist in the configuration file!" % engine)
        sys.exit(1)

    repository.upstream = remote_repo
    click.echo(
        "%s set to track %s:%s." % (repository.to_schema(), engine, remote_repo.to_schema())
    )
def get_upstream(self, repository: Repository):
    """
    Return the counterpart of `repository` on the 'bedrock' engine.

    Args:
        - repository (Repository): repository used as the template

    Returns:
        - Repository: the same repository bound to the 'bedrock' engine, which is
            requested with autocommit enabled
    """
    bedrock_engine = get_engine('bedrock', autocommit=True)
    upstream = Repository.from_template(repository, engine=bedrock_engine)
    return upstream
def rm_c(image_spec, yes):
    """
    Delete schemas, repositories or images.

    If the target of this command is a Postgres schema, this performs DROP SCHEMA CASCADE.

    If the target of this command is a Splitgraph repository, this deletes the repository and all of its history.

    If the target of this command is an image, this deletes the image and all of its children.

    In any case, this command will ask for confirmation of the deletion, unless ``-y`` is passed. If ``-r``
    (``--remote``), is passed, this will perform deletion on a remote Splitgraph engine (registered in the config)
    instead, assuming the user has write access to the remote repository.

    This does not delete any physical objects that the deleted repository/images depend on:
    use ``sgr cleanup`` to do that.

    Examples:

    ``sgr rm temporary_schema``

    Deletes ``temporary_schema`` from the local engine.

    ``sgr rm --remote data.splitgraph.com username/repo``

    Deletes ``username/repo`` from the Splitgraph registry.

    ``sgr rm -y username/repo:old_branch``

    Deletes the image pointed to by ``old_branch`` as well as all of its children (images created by a commit based
    on this image), as well as all of the tags that point to now deleted images, without asking for confirmation.
    Note this will not delete images that import tables from the deleted images via Splitfiles or indeed the
    physical objects containing the actual tables.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine
    from splitgraph.core.engine import repository_exists

    engine = get_engine()

    repository, image = image_spec
    repository = Repository.from_template(repository, engine=engine)
    if not image:
        # No image given: the whole repository (or plain schema) is the target.
        click.echo(
            ("Repository" if repository_exists(repository) else "Postgres schema")
            + " %s will be deleted." % repository.to_schema()
        )
        if not yes:
            click.confirm("Continue? ", abort=True)

        # Don't try to "uncheckout" repositories on the registry/other remote engines
        repository.delete(uncheckout=engine.name == "LOCAL")
        repository.commit_engines()
    else:
        # An image was given: delete it together with all of its child images and
        # every tag that points at one of them.
        image = repository.images[image]
        images_to_delete = repository.images.get_all_child_images(image.image_hash)
        tags_to_delete = [t for i, t in repository.get_all_hashes_tags() if i in images_to_delete]

        click.echo("Images to be deleted:")
        click.echo("\n".join(sorted(images_to_delete)))
        click.echo("Total: %d" % len(images_to_delete))

        click.echo("\nTags to be deleted:")
        click.echo("\n".join(sorted(tags_to_delete)))
        click.echo("Total: %d" % len(tags_to_delete))

        if "HEAD" in tags_to_delete:
            # If we're deleting an image that we currently have checked out,
            # we need to make sure the rest of the metadata (e.g. current state of the audit table) is consistent,
            # it's better to disallow these deletions completely.
            raise CheckoutError(
                "Deletion will affect a checked-out image! Check out a different branch "
                "or do sgr checkout -u %s!" % repository.to_schema()
            )
        if not yes:
            click.confirm("Continue? ", abort=True)

        repository.images.delete(images_to_delete)
        repository.commit_engines()
        # NOTE(review): the source lost its indentation; "Success." is assumed to be
        # printed only on the image-deletion path -- confirm against the original file.
        click.echo("Success.")