Example #1
def test_misc_mountpoint_management(pg_repo_local, mg_repo_local):
    runner = CliRunner()

    result = runner.invoke(status_c)
    assert str(pg_repo_local) in result.output
    assert str(mg_repo_local) in result.output

    # sgr rm -y test/pg_mount (no prompting)
    result = runner.invoke(rm_c, [str(mg_repo_local), "-y"])
    assert result.exit_code == 0
    assert not repository_exists(mg_repo_local)

    # sgr cleanup
    result = runner.invoke(cleanup_c)
    assert "Deleted 1 object" in result.output

    # sgr init
    result = runner.invoke(init_c, ["output"])
    assert "Initialized empty repository output" in result.output
    assert repository_exists(OUTPUT)

    # sgr mount with a file
    with tempfile.NamedTemporaryFile("w") as f:
        json.dump(
            {
                "stuff": {
                    "db": "origindb",
                    "coll": "stuff",
                    "schema": {"name": "text", "duration": "numeric", "happy": "boolean"},
                }
            },
            f,
        )
        f.flush()

        result = runner.invoke(
            mount_c,
            [
                "mongo_fdw",
                str(mg_repo_local),
                "-c",
                "originro:originpass@mongoorigin:27017",
                "-o",
                "@" + f.name,
            ],
        )
    assert result.exit_code == 0
    assert mg_repo_local.run_sql("SELECT duration from stuff WHERE name = 'James'") == [
        (Decimal(2),)
    ]
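For reference, the mount invocation above corresponds roughly to this command line (a sketch: the repository name and the options file name are placeholders, while the handler, connection string and schema come from the test itself):

# sgr mount with a file, expressed as a shell command:
#   sgr mount mongo_fdw test/mg_mount -c originro:originpass@mongoorigin:27017 -o @mongo_options.json
# where mongo_options.json contains the same {"stuff": {...}} document the test writes to a temporary file.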
Example #2
def _execute_from(
        node: Node,
        output: Repository) -> Tuple[Repository, Optional[ProvenanceLine]]:
    interesting_nodes = extract_nodes(node, ["repo_source", "repository"])
    repo_source = get_first_or_none(interesting_nodes, "repo_source")
    output_node = get_first_or_none(interesting_nodes, "repository")
    provenance: Optional[ProvenanceLine] = None

    if output_node:
        # AS (output) detected, change the current output repository to it.
        output = Repository.from_schema(output_node.match.group(0))
        logging.info("Changed output repository to %s" % str(output))

        # NB this destroys all data in the case where we ran some commands in the Splitfile and then
        # did FROM (...) without AS repository
        if repository_exists(output):
            logging.info("Clearing all output from %s" % str(output))
            output.delete()
    if not repository_exists(output):
        output.init()
    if repo_source:
        repository, tag_or_hash = parse_image_spec(repo_source)
        source_repo = lookup_repository(repository.to_schema(),
                                        include_local=True)

        if source_repo.engine.name == "LOCAL":
            # For local repositories, make sure to update them if they've an upstream
            if source_repo.upstream:
                source_repo.pull()

        # Get the target image hash from the source repo: otherwise, if the tag is, say, 'latest' and
        # the output has just had the base commit (000...) created in it, that commit will be the latest.
        clone(source_repo, local_repository=output, download_all=False)
        source_hash = source_repo.images[tag_or_hash].image_hash
        output.images.by_hash(source_hash).checkout()
        provenance = {
            "type": "FROM",
            "source_namespace": source_repo.namespace,
            "source": source_repo.repository,
            "source_hash": source_hash,
        }
    else:
        # FROM EMPTY AS repository -- initializes an empty repository (say, to create a table or import
        # the results of a previous stage in a multistage build).
        # In this case, if AS repository has been specified, it's already been initialized. If not, this
        # command literally does nothing.
        if not output_node:
            raise SplitfileError(
                "FROM EMPTY without AS (repository) does nothing!")
    return output, provenance
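The comments above describe two Splitfile forms this function handles. A minimal sketch of both, with placeholder repository names:

# FROM some_namespace/source_repo:latest AS some_namespace/output_repo
#   -- clones the source image and switches the build output to output_repo
# FROM EMPTY AS some_namespace/output_repo
#   -- initializes an empty output repository without cloning anything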
Example #3
def prepare_new_image(
        repository: "Repository",
        hash_or_tag: Optional[str]) -> Tuple[Optional[Image], str]:
    # Generate a random 256-bit hex string to use as the new image hash.
    new_image_hash = "{:064x}".format(getrandbits(256))
    if repository_exists(repository):
        # Clone the base image and delta compress against it
        base_image: Optional[
            Image] = repository.images[hash_or_tag] if hash_or_tag else None
        repository.images.add(parent_id=None,
                              image=new_image_hash,
                              comment="Singer tap ingestion")
        if base_image:
            repository.engine.run_sql(
                "INSERT INTO splitgraph_meta.tables "
                "(SELECT namespace, repository, %s, table_name, table_schema, object_ids "
                "FROM splitgraph_meta.tables "
                "WHERE namespace = %s AND repository = %s AND image_hash = %s)",
                (
                    new_image_hash,
                    repository.namespace,
                    repository.repository,
                    base_image.image_hash,
                ),
            )
    else:
        base_image = None
        repository.images.add(parent_id=None,
                              image=new_image_hash,
                              comment="Singer tap ingestion")
    return base_image, new_image_hash
Example #4
def test_pull_download_error(local_engine_empty, unprivileged_pg_repo,
                             clean_minio, interrupted):
    # Same test backwards: if we're pulling and abort or fail the download, make sure we can
    # recover and retry pulling the repo.

    with patch.dict(
            "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS",
        {"S3": _flaky_handler(interrupted)},
    ):
        with pytest.raises(Exception) as e:
            clone(unprivileged_pg_repo,
                  local_repository=PG_MNT,
                  download_all=True)

    # Check that the pull succeeded (repository registered locally) but the objects
    # are just marked as external, not downloaded
    assert repository_exists(PG_MNT)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 1
    assert len(
        PG_MNT.objects.get_external_object_locations(
            PG_MNT.objects.get_all_objects())) == 2
    assert (PG_MNT.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
        return_shape=ResultShape.ONE_ONE,
    ) == 1)

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=True)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2
    assert len(list(PG_MNT.images)) == 2
    assert (PG_MNT.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
        return_shape=ResultShape.ONE_ONE,
    ) == 2)
Example #5
    def load(self,
             repository: "Repository",
             tables: Optional[TableInfo] = None) -> str:
        if not repository_exists(repository):
            repository.init()

        # Random 256-bit hex identifiers for the new image and the temporary staging schema.
        image_hash = "{:064x}".format(getrandbits(256))
        tmp_schema = "{:064x}".format(getrandbits(256))
        repository.images.add(
            parent_id=None,
            image=image_hash,
        )
        repository.object_engine.create_schema(tmp_schema)

        try:
            self._load(schema=tmp_schema, tables=tables)

            repository._commit(
                head=None,
                image_hash=image_hash,
                snap_only=True,
                chunk_size=100000,
                schema=tmp_schema,
            )
        finally:
            repository.object_engine.delete_schema(tmp_schema)
            repository.commit_engines()

        return image_hash
Example #6
def test_engine_autocommit(local_engine_empty):
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine", autocommit=True)

    repo = Repository("test", "repo", engine=engine)
    repo.init()

    repo.engine.rollback()
    assert repository_exists(Repository.from_template(repo, engine=local_engine_empty))
Example #7
def test_rm_repositories(pg_repo_local, pg_repo_remote):
    runner = CliRunner()

    # sgr rm test/pg_mount, say "no"
    result = runner.invoke(rm_c, [str(pg_repo_local)], input="n\n")
    assert result.exit_code == 1
    assert "Repository test/pg_mount will be deleted" in result.output
    assert repository_exists(pg_repo_local)

    # sgr rm test/pg_mount, say "yes"
    result = runner.invoke(rm_c, [str(pg_repo_local)], input="y\n")
    assert result.exit_code == 0
    assert not repository_exists(pg_repo_local)

    # sgr rm test/pg_mount -r remote_engine
    result = runner.invoke(rm_c, [str(pg_repo_remote), "-r", "remote_engine"], input="y\n")
    assert result.exit_code == 0
    assert not repository_exists(pg_repo_remote)
Example #8
def test_pull_push(local_engine_empty, pg_repo_remote):
    runner = CliRunner()
    pg_repo_local = Repository.from_template(pg_repo_remote,
                                             engine=local_engine_empty)

    # Clone the base 0000.. image first to check single-image clones
    assert len(pg_repo_local.images()) == 0
    result = runner.invoke(clone_c, [str(pg_repo_local) + ":" + "00000000"])
    assert result.exit_code == 0
    assert len(pg_repo_local.images()) == 1
    assert repository_exists(pg_repo_local)

    # Clone the rest of the repo
    result = runner.invoke(clone_c, [str(pg_repo_local)])
    assert result.exit_code == 0
    assert len(pg_repo_local.images()) == 2

    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    remote_engine_head = pg_repo_remote.commit()

    # Pull the new image
    result = runner.invoke(
        pull_c,
        [str(pg_repo_local) + ":" + remote_engine_head.image_hash[:10]])
    assert result.exit_code == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_local.images()) == 3

    # Pull the whole repo (should be no changes)
    result = runner.invoke(pull_c, [str(pg_repo_local)])
    assert result.exit_code == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_local.images()) == 3

    # Pull repo downloading everything
    result = runner.invoke(pull_c, [str(pg_repo_local), "--download-all"])
    assert result.exit_code == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 3

    pg_repo_local.images.by_hash(remote_engine_head.image_hash).checkout()

    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'mustard')")
    local_head = pg_repo_local.commit()

    assert local_head.image_hash not in list(pg_repo_remote.images)

    # Push out the single new image first
    result = runner.invoke(
        push_c,
        [str(pg_repo_local) + ":" + local_head.image_hash[:10], "-h", "DB"])
    assert result.exit_code == 0
    assert len(pg_repo_remote.images()) == 4

    # Push out the whole repo
    result = runner.invoke(push_c, [str(pg_repo_local), "-h", "DB"])
    assert result.exit_code == 0
    assert pg_repo_local.head.get_table("fruits")
Example #9
    def convert(self, value: str, param: Optional[Parameter],
                ctx: Optional[Context]) -> "Repository":
        from splitgraph.core.repository import Repository

        result = Repository.from_schema(value)
        if self.exists:
            from splitgraph.core.engine import repository_exists

            if not repository_exists(result):
                raise RepositoryNotFoundError("Unknown repository %s" % result)
        return result
Example #10
    def by_tag(self, tag: str, raise_on_none: bool = True) -> Optional[Image]:
        """
        Returns an image with a given tag.

        :param tag: Tag. 'latest' is a special case: it returns the most recent image in the repository.
        :param raise_on_none: Whether to raise an error or return None if the tag doesn't exist.
        """
        engine = self.engine
        if not repository_exists(self.repository):
            raise RepositoryNotFoundError("Unknown repository %s!" %
                                          str(self.repository))

        if tag == "latest":
            # Special case, return the latest commit from the repository.
            result = self.engine.run_sql(
                select(
                    "get_images",
                    ",".join(IMAGE_COLS),
                    schema=SPLITGRAPH_API_SCHEMA,
                    table_args="(%s,%s)",
                ) + SQL(" ORDER BY created DESC LIMIT 1"),
                (self.repository.namespace, self.repository.repository),
                return_shape=ResultShape.ONE_MANY,
            )
            if result is None:
                raise ImageNotFoundError("No images found in %s!" %
                                         self.repository.to_schema())
            return self._make_image(result)

        result = engine.run_sql(
            select(
                "get_tagged_images",
                "image_hash",
                "tag = %s",
                schema=SPLITGRAPH_API_SCHEMA,
                table_args="(%s,%s)",
            ),
            (self.repository.namespace, self.repository.repository, tag),
            return_shape=ResultShape.ONE_ONE,
        )
        if result is None:
            if raise_on_none:
                schema = self.repository.to_schema()
                if tag == "HEAD":
                    raise ImageNotFoundError(
                        'No current checked out revision found for %s. Check one out with "sgr '
                        'checkout %s:image_hash".' % (schema, schema))
                raise ImageNotFoundError("Tag %s not found in repository %s" %
                                         (tag, schema))
            return None
        return self.by_hash(result)
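A hedged usage sketch for this method (the repository schema and tag names are placeholders):

from splitgraph.core.repository import Repository

repo = Repository.from_schema("some_namespace/some_repo")    # placeholder repository
latest_image = repo.images.by_tag("latest")                   # most recent image in the repository
maybe_image = repo.images.by_tag("v1", raise_on_none=False)   # returns None if the tag doesn't exist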
Example #11
def generate_c(repository):
    """
    Generate a repository with some example data.

    :param repository: Repository to generate. Must not already exist.
    """
    from splitgraph.core.engine import repository_exists

    if repository_exists(repository):
        raise click.ClickException(
            "Repository %s already exists, use sgr rm to delete it!" % repository.to_schema()
        )

    repository.init()
    # Insert some data
    generate_table(repository, "demo", size=_DEMO_TABLE_SIZE)

    image = repository.commit()
    click.echo(
        "Generated %s:%s with %s rows, image hash %s."
        % (repository.to_schema(), "demo", _DEMO_TABLE_SIZE, image.image_hash[:12])
    )
Example #12
    def init_repo(self, repo_info: RepoInfo) -> Repository:
        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)

        if not repository_exists(repo):
            self.logger.info("Creating repo {}/{}...".format(
                repo.namespace, repo.repository))
            repo.init()

        if repo_info.remote_name:
            remote = Repository.from_template(repo,
                                              engine=get_engine(
                                                  repo_info.remote_name))
            cloned_repo = clone(
                remote,
                local_repository=repo,
                download_all=False,
                overwrite_objects=True,
                overwrite_tags=True,
            )

        return repo
Example #13
    def sync(
        self,
        repository: "Repository",
        image_hash: Optional[str],
        tables: Optional[TableInfo] = None,
    ) -> str:
        if not repository_exists(repository):
            repository.init()

        state = get_ingestion_state(repository, image_hash)
        image_hash = image_hash or "0" * 64
        repository.images[image_hash].checkout()

        try:
            new_state = self._sync(schema=repository.to_schema(),
                                   state=state,
                                   tables=tables)

            if new_state:
                # Write the new state to the table
                if not repository.object_engine.table_exists(
                        repository.to_schema(), INGESTION_STATE_TABLE):
                    repository.object_engine.create_table(
                        repository.to_schema(), INGESTION_STATE_TABLE,
                        INGESTION_STATE_SCHEMA)

                repository.run_sql(
                    SQL("INSERT INTO {} (timestamp, state) VALUES(now(), %s)").
                    format(Identifier(INGESTION_STATE_TABLE)),
                    (Json(new_state), ),
                )

            new_image = repository.commit()
        finally:
            repository.uncheckout()
            repository.commit_engines()

        return new_image.image_hash
Example #14
    def convert(
        self, value: str, param: Optional[Parameter], ctx: Optional[Context]
    ) -> Tuple["Repository", Optional[Union["Image", str]]]:
        """
        Image specification must have the format [NAMESPACE/]REPOSITORY[:HASH_OR_TAG].

        The parser returns a tuple of (repository object, tag or hash).
        """
        from splitgraph.core.output import parse_repo_tag_or_hash

        repo, tag_or_hash = parse_repo_tag_or_hash(value, default=self.default)

        if self.get_image or self.repository_exists:
            # Check image/repo exists if we're asked (or if we need to produce
            # an actual Image object)
            from splitgraph.core.engine import repository_exists

            if not repository_exists(repo):
                raise RepositoryNotFoundError("Unknown repository %s" % repo)

        if tag_or_hash is not None and self.get_image:
            return repo, repo.images[tag_or_hash]
        else:
            return repo, tag_or_hash
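Illustrative conversions implied by the docstring (the values and resulting tuples are examples only, assuming `self.get_image` is not set):

# "noaa/climate:latest" -> (Repository("noaa", "climate"), "latest")
# "noaa/climate"        -> (Repository("noaa", "climate"), self.default)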
Example #15
def test_reindex_and_force_push(pg_repo_local, pg_repo_remote):
    runner = CliRunner(mix_stderr=False)

    result = runner.invoke(clone_c, [str(pg_repo_local)])
    assert result.exit_code == 0
    assert repository_exists(pg_repo_local)

    result = runner.invoke(
        reindex_c,
        [
            str(pg_repo_local) + ":latest", "fruits",
            '-i {"bloom": {"name": {"probability": 0.01}}}'
        ],
    )
    assert result.exit_code == 0
    assert "Reindexed 1 object" in result.output

    result = runner.invoke(push_c, [str(pg_repo_local) + ":latest", "-f"],
                           catch_exceptions=False)
    assert result.exit_code == 0

    obj = pg_repo_remote.images["latest"].get_table("fruits").objects[0]
    assert "bloom" in pg_repo_remote.objects.get_object_meta(
        [obj])[obj].object_index
Example #16
def execute_commands(
    commands: str,
    params: Optional[Dict[str, str]] = None,
    output: Optional[Repository] = None,
    output_base: str = "0" * 32,
) -> None:
    """
    Executes a series of Splitfile commands.

    :param commands: A string with the raw Splitfile.
    :param params: A dictionary of parameters to be applied to the Splitfile (`${PARAM}` is replaced with the specified
        parameter value).
    :param output: Output repository to execute the Splitfile against.
    :param output_base: If not None, a revision that gets checked out for all Splitfile actions to be committed
        on top of it.
    """
    if params is None:
        params = {}
    if output and repository_exists(output) and output_base is not None:
        output.images.by_hash(output_base).checkout()
    # Use a random target schema if unspecified.
    output = output or Repository.from_schema("output_%0.2x" % getrandbits(16))

    # Don't initialize the output until a command writing to it asks us to
    # (otherwise we might have a FROM ... AS output_name change it).
    repo_created = False

    def _initialize_output(output):
        if not repository_exists(output):
            nonlocal repo_created
            output.init()
            repo_created = True

    from splitgraph.commandline.common import Color

    node_list = parse_commands(commands, params=params)

    # Record the internal structure of commands used to create the final image.
    provenance: List[ProvenanceLine] = []

    try:
        for i, node in enumerate(node_list):
            logging.info(
                Color.BOLD + "\nStep %d/%d : %s" %
                (i + 1, len(node_list), truncate_line(node.text, length=60)) +
                Color.END)
            if node.expr_name == "from":
                output, maybe_provenance_line = _execute_from(node, output)
                if maybe_provenance_line:
                    provenance.append(maybe_provenance_line)

            elif node.expr_name == "import":
                _initialize_output(output)
                provenance_line = _execute_import(node, output)
                provenance.append(provenance_line)

            elif node.expr_name == "sql" or node.expr_name == "sql_file":
                _initialize_output(output)
                provenance_line = _execute_sql(node, output)
                provenance.append(provenance_line)

            elif node.expr_name == "custom":
                _initialize_output(output)
                provenance_line = _execute_custom(node, output)
                provenance.append(provenance_line)

        final_image = output.head_strict
        final_image.set_provenance(provenance)
        get_engine().commit()
        logging.info("Successfully built %s:%s." %
                     (str(output), final_image.image_hash[:12]))

    except Exception:
        if repo_created and len(output.images()) == 1:
            # As a corner case, if we created a repository and there's been
            # a failure running the Splitfile (on the first command), we delete the dummy
            # 0000... image and the rest of the repository as part of cleanup.
            output.delete()
        get_engine().rollback()
        raise
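A hedged usage sketch based on the signature and docstring above (the parameter value and repository name are placeholders; the Splitfile contents are elided):

from splitgraph.core.repository import Repository

raw_splitfile = "..."  # raw Splitfile text (FROM / IMPORT / SQL commands), elided here
execute_commands(
    raw_splitfile,
    params={"PARAM": "value"},  # substituted for ${PARAM} occurrences in the Splitfile
    output=Repository.from_schema("some_namespace/output_repo"),
)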
Example #17
    def write(self, value_: Any, **kwargs: Any) -> Result:
        """
        Writes the result to a repository on Splitgraph


        Args:
            - value_ (Any): the value to write; will then be stored as the `value` attribute
                of the returned `Result` instance
            - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

        Returns:
            - Result: returns a new `Result` with the `value`, `comment`, `table`, and `tag` attributes
        """

        cfg = patch_config(create_config_dict(), self.env or dict())
        engine = PostgresEngine(name='SplitgraphResult', conn_params=cfg)
        engine.initialize()
        repo = Repository(namespace=self.namespace, repository=self.repo_name, engine=engine)

        assert isinstance(value_, pd.DataFrame)
        assert engine.connected

        if not repository_exists(repo) and self.auto_init_repo:
            self.logger.info("Creating repo {}/{}...".format(repo.namespace, repo.repository))
            repo.init()

        # TODO: Retrieve the repo from bedrock first

        new = self.format(**kwargs)
        new.value = value_

        self.logger.info("Starting to upload result to {}...".format(new.table))

        with self.atomic(engine):
            self.logger.info("checkout")
            img = repo.head
            img.checkout(force=True)

            self.logger.info("df to table")
            df_to_table(new.value, repository=repo, table=new.table, if_exists='replace')

            self.logger.info("commit")
            new_img = repo.commit(comment=new.comment, chunk_size=10000)
            new_img.tag(new.tag)


        # if (repo.diff(new.table, img, new_img)):
        if self.auto_push:
            self.logger.info("push")
            repo.push(
                self.get_upstream(repo),
                handler="S3",
                overwrite_objects=True,
                overwrite_tags=True,
                reupload_objects=True,
            )

        engine.close()
        self.logger.info("Finished uploading result to {}...".format(new.table))

        return new
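A condensed sketch of what this write path does, using only the calls that appear above (the repository, DataFrame contents and the import path for df_to_table are assumptions):

import pandas as pd
from splitgraph.core.repository import Repository
from splitgraph.core.engine import repository_exists
from splitgraph.ingestion.pandas import df_to_table  # assumed import path

repo = Repository(namespace="my_namespace", repository="my_repo")  # placeholder repository
if not repository_exists(repo):
    repo.init()
if repo.head:
    repo.head.checkout(force=True)

df = pd.DataFrame({"name": ["apple"], "duration": [2]})
df_to_table(df, repository=repo, table="fruits", if_exists="replace")
new_image = repo.commit(comment="daily load", chunk_size=10000)
new_image.tag("latest")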
Example #18
def rm_c(image_spec, yes):
    """
    Delete schemas, repositories or images.

    If the target of this command is a Postgres schema, this performs DROP SCHEMA CASCADE.

    If the target of this command is a Splitgraph repository, this deletes the repository and all of its history.

    If the target of this command is an image, this deletes the image and all of its children.

    In any case, this command will ask for confirmation of the deletion, unless ``-y`` is passed. If ``-r``
    (``--remote``) is passed, this will perform the deletion on a remote Splitgraph engine (registered in the config)
    instead, assuming the user has write access to the remote repository.

    This does not delete any physical objects that the deleted repository/images depend on:
    use ``sgr cleanup`` to do that.

    Examples:

    ``sgr rm temporary_schema``

    Deletes ``temporary_schema`` from the local engine.

    ``sgr rm --remote data.splitgraph.com username/repo``

    Deletes ``username/repo`` from the Splitgraph registry.

    ``sgr rm -y username/repo:old_branch``

    Deletes the image pointed to by ``old_branch``, all of its children (images created by commits based
    on this image) and all tags that point to the now-deleted images, without asking for confirmation.
    Note that this will not delete images that import tables from the deleted images via Splitfiles, nor the
    physical objects containing the actual tables.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine
    from splitgraph.core.engine import repository_exists

    engine = get_engine()

    repository, image = image_spec
    repository = Repository.from_template(repository, engine=engine)
    if not image:
        click.echo(
            ("Repository" if repository_exists(repository) else "Postgres schema")
            + " %s will be deleted." % repository.to_schema()
        )
        if not yes:
            click.confirm("Continue? ", abort=True)

        # Don't try to "uncheckout" repositories on the registry/other remote engines
        repository.delete(uncheckout=engine.name == "LOCAL")
        repository.commit_engines()
    else:
        image = repository.images[image]
        images_to_delete = repository.images.get_all_child_images(image.image_hash)
        tags_to_delete = [t for i, t in repository.get_all_hashes_tags() if i in images_to_delete]

        click.echo("Images to be deleted:")
        click.echo("\n".join(sorted(images_to_delete)))
        click.echo("Total: %d" % len(images_to_delete))

        click.echo("\nTags to be deleted:")
        click.echo("\n".join(sorted(tags_to_delete)))
        click.echo("Total: %d" % len(tags_to_delete))

        if "HEAD" in tags_to_delete:
            # If we're deleting an image that we currently have checked out,
            # we need to make sure the rest of the metadata (e.g. the current state of the audit table)
            # stays consistent, so it's better to disallow these deletions completely.
            raise CheckoutError(
                "Deletion will affect a checked-out image! Check out a different branch "
                "or do sgr checkout -u %s!" % repository.to_schema()
            )
        if not yes:
            click.confirm("Continue? ", abort=True)

        repository.images.delete(images_to_delete)
        repository.commit_engines()
        click.echo("Success.")
Example #19
def import_c(image_spec, table_or_query, target_repository, target_table):
    """
    Import tables into a Splitgraph repository.

    Imports a table or a result of a query from a local Splitgraph repository or a Postgres schema into another
    Splitgraph repository.

    Examples:

    ``sgr import noaa/climate:my_tag climate_data my/repository``

    Create a new image in ``my/repository`` with the ``climate_data`` table included. This links the new image to
    the physical object, meaning that the history of the ``climate_data`` table is preserved.

    If no tag is specified, the 'latest' image is used (not the HEAD image or the current state of the
    checked-out image).

    ``sgr import noaa/climate:my_tag "SELECT * FROM climate_data" my/repository climate_data``

    Create a new image in ``my/repository`` with the result of the query stored in the ``climate_data`` table. This
    creates a new physical object without any linkage to the original data, so the history of the ``climate_data``
    table isn't preserved. The SQL query can interact with multiple tables in the source image.

    ``sgr import other_schema other_table my/repository``

    Since other_schema isn't a Splitgraph repository, this will copy ``other_schema.other_table``
    into a new Splitgraph object and add the ``other_table`` table to a new image in ``my/repository``.

    Note that importing doesn't discard or commit pending changes in the target Splitgraph repository: a new image
    is created with the new table added, the new table is materialized in the repository and the HEAD pointer is moved.
    """
    from splitgraph.core.engine import repository_exists

    repository, image = image_spec

    if repository_exists(repository):
        foreign_table = False
        image = repository.images[image]
        # If the source table doesn't exist in the image, we'll treat it as a query instead.
        try:
            image.get_table(table_or_query)
            is_query = False
        except TableNotFoundError:
            is_query = True
    else:
        # If the source schema isn't actually a Splitgraph repo, we'll be copying the table verbatim.
        foreign_table = True
        is_query = table_or_query not in repository.engine.get_all_tables(
            repository.to_schema())
        image = None

    if is_query and not target_table:
        click.echo("TARGET_TABLE is required when the source is a query!")
        sys.exit(1)

    target_repository.import_tables(
        [target_table] if target_table else [],
        repository,
        [table_or_query],
        image_hash=image.image_hash if image else None,
        foreign_tables=foreign_table,
        table_queries=[] if not is_query else [True],
    )

    click.echo("%s:%s has been imported from %s:%s%s" % (
        str(target_repository),
        target_table,
        str(repository),
        table_or_query,
        (" (%s)" % image.image_hash[:12] if image else ""),
    ))
Example #20
    def write(self, value_: Any, **kwargs: Any) -> Result:
        """
        Writes the result to a repository on Splitgraph


        Args:
            - value_ (Any): the value to write; will then be stored as the `value` attribute
                of the returned `Result` instance
            - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

        Returns:
            - Result: returns a new `Result` with the `value`, `comment`, `table`, and `tag` attributes
        """

        if self.schema is not None:
            errors = self.schema.validate(value_)
            if errors:
                raise SchemaValidationError(errors)

        new = self.format(**kwargs)
        new.value = value_

        repo_info = parse_repo(new.location)

        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)
        remote = Repository.from_template(repo,
                                          engine=get_engine(
                                              repo_info.remote_name,
                                              autocommit=True))

        assert isinstance(value_, pd.DataFrame)

        if not repository_exists(repo) and self.auto_init_repo:
            self.logger.info("Creating repo {}/{}...".format(
                repo.namespace, repo.repository))
            repo.init()

        # TODO: Retrieve the repo from bedrock first

        self.logger.info("Starting to upload result to {}...".format(
            new.location))

        with self.atomic(repo.engine):
            self.logger.info("checkout")
            img = repo.head

            img.checkout(force=True)

            self.logger.info("df to table")
            df_to_table(new.value,
                        repository=repo,
                        table=repo_info.table,
                        if_exists='replace')

            self.logger.info("commit")
            new_img = repo.commit(comment=new.comment, chunk_size=10000)
            new_img.tag(repo_info.tag)

        # if (repo.diff(new.table, img, new_img)):
        if self.auto_push:
            self.logger.info("push")
            repo.push(
                remote,
                handler="S3",
                overwrite_objects=True,
                overwrite_tags=True,
                reupload_objects=True,
            )

        self.logger.info("Finished uploading result to {}...".format(
            new.location))

        return new
Example #21
    def _initialize_output(output):
        if not repository_exists(output):
            nonlocal repo_created
            output.init()
            repo_created = True