Example #1
def sql_c(remote, show_all, json, query):
    """
    Run SQL on or connect to the Splitgraph Data Delivery Network.

    If a query isn't passed, this will return a libpq-compatible connection string to
    the registry's SQL endpoint. It can be used to connect to the endpoint with other SQL clients:

    ```
    pgcli $(sgr cloud sql)
    ```

    If a query is passed, this will run an SQL query against the SQL endpoint.
    """
    ddn_params = _get_ddn_conn_params(remote)
    from splitgraph.engine.postgres.engine import get_conn_str, PostgresEngine

    if not query:
        click.echo(get_conn_str(ddn_params))
        return

    # Build an engine to connect to the DDN; disable registry behaviour and pre-flight version checks.
    engine = PostgresEngine(name=remote,
                            conn_params=ddn_params,
                            registry=False,
                            check_version=False)

    try:
        results = engine.run_sql(query)
        emit_sql_results(results, use_json=json, show_all=show_all)
    finally:
        engine.close()
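
For context, here is a minimal sketch of the query branch above run outside the CLI. Everything below (endpoint host, database name, credentials) is an illustrative assumption, not real DDN configuration:

```
# Sketch only: connect straight to the DDN with PostgresEngine and run one query.
from splitgraph.engine.postgres.engine import PostgresEngine

ddn_params = {
    "SG_ENGINE_HOST": "data.splitgraph.com",  # assumed endpoint
    "SG_ENGINE_PORT": "5432",
    "SG_ENGINE_USER": "<api_key>",            # placeholder credentials
    "SG_ENGINE_PWD": "<api_secret>",
    "SG_ENGINE_DB_NAME": "ddn",               # assumed database name
}

engine = PostgresEngine(name="ddn", conn_params=ddn_params,
                        registry=False, check_version=False)
try:
    print(engine.run_sql("SELECT 1"))
finally:
    engine.close()
```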
Example #2
def test_engine_autocommit(local_engine_empty):
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine", autocommit=True)

    repo = Repository("test", "repo", engine=engine)
    repo.init()

    repo.engine.rollback()
    assert repository_exists(Repository.from_template(repo, engine=local_engine_empty))
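
The assertion above relies on `autocommit=True` making `repo.init()` durable even though the engine is rolled back afterwards. For contrast, a minimal sketch of the same flow with the default `autocommit=False` (same fixtures assumed): the work has to be committed explicitly or the rollback discards it.

```
# Sketch: without autocommit, an explicit commit is needed before the rollback.
engine = PostgresEngine(conn_params=conn_params, name="test_engine")
repo = Repository("test", "repo", engine=engine)
repo.init()
engine.commit()    # without this line, the rollback below would undo repo.init()
engine.rollback()
assert repository_exists(repo)
```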
Example #3
def get_engine(
    name: Optional[str] = None,
    use_socket: bool = False,
    use_fdw_params: bool = False,
    autocommit: bool = False,
) -> "PostgresEngine":
    """
    Get the current global engine or a named remote engine.

    :param name: Name of the remote engine as specified in the config. If None, the current global engine
        is returned.
    :param use_socket: Use a local UNIX socket instead of PG_HOST, PG_PORT for LOCAL engine connections.
    :param use_fdw_params: Use the _FDW connection parameters (SG_ENGINE_FDW_HOST/PORT). By default,
        will infer from the global splitgraph.config.IN_FDW flag.
    :param autocommit: If True, the engine will not open SQL transactions implicitly.
    """
    from .postgres.engine import PostgresEngine

    if not name:
        if isinstance(_ENGINE, PostgresEngine):
            return _ENGINE
        name = _ENGINE
    if name not in _ENGINES:
        # Here we'd get the engine type/backend (Postgres/MySQL etc)
        # and instantiate the actual Engine class.
        # As we only have PostgresEngine, we instantiate that.

        conn_params = cast(Dict[str, Optional[str]],
                           _prepare_engine_config(CONFIG, name))

        try:
            is_registry = bool(conn_params.pop("SG_IS_REGISTRY"))
        except KeyError:
            is_registry = False
        try:
            check_version = bool(conn_params.pop("SG_CHECK_VERSION"))
        except KeyError:
            check_version = False

        if name == "LOCAL" and use_socket:
            conn_params["SG_ENGINE_HOST"] = None
            conn_params["SG_ENGINE_PORT"] = None
        if use_fdw_params:
            conn_params["SG_ENGINE_HOST"] = conn_params["SG_ENGINE_FDW_HOST"]
            conn_params["SG_ENGINE_PORT"] = conn_params["SG_ENGINE_FDW_PORT"]

        _ENGINES[name] = PostgresEngine(
            conn_params=conn_params,
            name=name,
            autocommit=autocommit,
            registry=is_registry,
            check_version=check_version,
            in_fdw=use_fdw_params and name == "LOCAL",
        )
    return _ENGINES[name]
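
Typical call sites for this helper, as a sketch (the remote name is illustrative and would have to be defined in the config):

```
# Sketch of how get_engine is usually called.
engine = get_engine()                       # the current global (usually LOCAL) engine
remote = get_engine("data.splitgraph.com")  # a named remote engine from the config
fdw = get_engine("LOCAL", use_fdw_params=True, autocommit=True)  # FDW-side connection params
```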
Example #4
def _make_changeset(
    engine: PostgresEngine,
    old_schema: str,
    old_table: str,
    schema: str,
    table: str,
    schema_spec: TableSchema,
    upsert_condition: str = "TRUE",
) -> Changeset:
    """Build a fake changeset from the temporary table and the existing table to pass
    to the object manager (store as a Splitgraph diff)."""

    # PK -> (upserted / deleted, old row, new row)
    # As a memory-saving hack, we only record the values of the old row (read from the
    # current table) -- this is because object creation routines read the inserted rows
    # from the staging table anyway.
    change_key = [c for c, _ in get_change_key(schema_spec)]
    # Query:
    # SELECT n.pk_col_1, n.pk_col_2, ...,
    #        (custom upsert condition) AS upserted,
    #        (JSON object of the old row's non-PK columns) AS old_row
    # FROM new_table n LEFT OUTER JOIN old_table o ON o.pk = n.pk
    # WHERE o.* IS DISTINCT FROM n.*
    query = (
        SQL("SELECT ") +
        SQL(",").join(SQL("n.") + Identifier(c) for c in change_key) +
        SQL(",") + SQL(upsert_condition + " AS upserted, ")
        # If PK doesn't exist in the new table, old_row is null, else output it
        + SQL("CASE WHEN ") + SQL(" AND ").join(
            SQL("o.{0} IS NULL").format(Identifier(c)) for c in change_key) +
        SQL(" THEN '{}'::json ELSE json_build_object(") + SQL(",").join(
            SQL("%s, o.") + Identifier(c.name)
            for c in schema_spec if c.name not in change_key) +
        SQL(") END AS old_row FROM {}.{} n LEFT OUTER JOIN {}.{} o ON "
            ).format(
                Identifier(schema),
                Identifier(table),
                Identifier(old_schema),
                Identifier(old_table),
            ) + SQL(" AND ").join(
                SQL("o.{0} = n.{0}").format(Identifier(c))
                for c in change_key) +
        SQL(" WHERE o.* IS DISTINCT FROM n.*")).as_string(engine.connection)
    args = [c.name for c in schema_spec if c.name not in change_key]
    result = engine.run_sql(query, args)
    return {tuple(row[:-2]): (row[-2], row[-1], {}) for row in result}
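
To make the return shape concrete, here is a hypothetical result for a table with primary key `id` and a single value column `val` (rows invented for illustration):

```
# PK tuple -> (upserted?, old_row, new_row). new_row is always left empty because
# the object creation routines read the inserted values from the staging table.
changeset = {
    (1,): (True, {"val": "old value"}, {}),  # row 1 existed before and was changed
    (2,): (True, {}, {}),                    # row 2 is new: old_row is the literal '{}'::json
}
```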
Example #5
def _make_changeset(
    engine: PostgresEngine,
    schema: str,
    table: str,
    schema_spec: TableSchema,
    upsert_condition: str = "TRUE",
) -> Changeset:
    """Build a fake changeset from the temporary table and the existing table to pass
    to the object manager (store as a Splitgraph diff)."""

    # PK -> (upserted / deleted, old row, new row)
    # We don't find out the old row here. This is because it requires a JOIN on the current
    # Splitgraph table, so if we're adding e.g. 100k rows to a 1M row table, it's going to cause big
    # performance issues. Instead, we pretend that all rows
    # have been inserted (apart from the ones that have been deleted by having the magic
    # _sdc_deleted_at column).

    # We also don't care about finding out the new row here, as the storage routine queries
    # the table directly to get those values.

    # The tradeoff is that now, when querying the table, we need to include not only fragments
    # whose index matches the query, but also all fragments that might overwrite those fragments
    # (through PK range overlap). Since we don't record old row values in this changeset's index,
    # we can no longer find if a fragment deletes some row by inspecting the index -- we need to
    # use PK ranges to find out overlapping fragments.

    change_key = [c for c, _ in get_change_key(schema_spec)]
    # Query:
    # SELECT n.pk_col_1, n.pk_col_2, ...,
    #        (custom upsert condition) AS upserted
    # FROM new_table n
    query = (SQL("SELECT ") +
             SQL(",").join(SQL("n.") + Identifier(c)
                           for c in change_key) + SQL(",") +
             SQL(upsert_condition + " AS upserted FROM {}.{} n").format(
                 Identifier(schema), Identifier(table))).as_string(
                     engine.connection)
    result = engine.run_sql(query)
    return {tuple(row[:-1]): (row[-1], {}, {}) for row in result}
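
The corresponding result shape for this variant, again with invented rows: the old row is never looked up, so deletions only show up through the upsert condition (e.g. one checking the `_sdc_deleted_at` column):

```
# PK tuple -> (upserted?, old_row, new_row); both row dicts are left empty here.
changeset = {
    (1,): (True, {}, {}),   # treated as an insert/upsert
    (2,): (False, {}, {}),  # upsert condition evaluated to FALSE, e.g. a soft-deleted row
}
```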
Example #6
def add_engine_c(image, port, username, no_init, no_sgconfig, inject_source,
                 no_pull, name, password, set_default):
    """
    Create and start a Splitgraph engine.

    This will pull the Splitgraph engine image, start it, create a Postgres user and initialize
    the engine.

    This also creates Docker volumes required to persist data/metadata.

    The engine Docker container by default will be named `splitgraph_engine_default` and
    its data and metadata volumes will have names `splitgraph_engine_default_data` and
    `splitgraph_engine_default_metadata`.
    """
    from splitgraph.engine.postgres.engine import PostgresEngine
    from splitgraph.config import CONFIG
    from docker.types import Mount

    client = get_docker_client()

    if not no_pull:
        click.echo("Pulling image %s..." % image)
        _pretty_pull(client, image)

    container_name = _get_container_name(name)
    data_name = _get_data_volume_name(name)
    metadata_name = _get_metadata_volume_name(name)

    # Set up the required mounts for data/metadata
    data_volume = Mount(target="/var/lib/splitgraph/objects",
                        source=data_name,
                        type="volume")
    metadata_volume = Mount(target="/var/lib/postgresql/data",
                            source=metadata_name,
                            type="volume")
    mounts = [data_volume, metadata_volume]

    click.echo("Creating container %s." % container_name)
    click.echo("Data volume: %s." % data_name)
    click.echo("Metadata volume: %s." % metadata_name)

    if inject_source:
        source_path = _convert_source_path(
            os.getenv(
                "SG_SOURCE_ROOT",
                os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             "../"))))
        source_volume = Mount(target="/splitgraph/splitgraph",
                              source=source_path,
                              type="bind")
        mounts.append(source_volume)
        click.echo("Source path: %s" % source_path)

    container = client.containers.run(
        image=image,
        detach=True,
        name=container_name,
        ports={"5432/tcp": port},
        mounts=mounts,
        environment={
            "POSTGRES_USER": username,
            "POSTGRES_PASSWORD": password,
            "POSTGRES_DB": "splitgraph",
            # Actual config to be injected later
            "SG_CONFIG_FILE": "/.sgconfig",
        },
    )

    click.echo("Container created, ID %s" % container.short_id)

    # Extract the host that we can reach the container on
    # (might be different from localhost if docker-machine is used)
    hostname = urlparse(client.api.base_url).hostname

    conn_params: Dict[str, str] = {
        "SG_ENGINE_HOST": hostname,
        "SG_ENGINE_PORT": str(port),
        # Even if the engine is exposed on a different port on the host,
        # need to make sure that it uses the default 5432 port to connect
        # to itself.
        "SG_ENGINE_FDW_HOST": "localhost",
        "SG_ENGINE_FDW_PORT": "5432",
        "SG_ENGINE_USER": username,
        "SG_ENGINE_PWD": password,
        "SG_ENGINE_DB_NAME": "splitgraph",
        "SG_ENGINE_POSTGRES_DB_NAME": "postgres",
        "SG_ENGINE_ADMIN_USER": username,
        "SG_ENGINE_ADMIN_PWD": password,
    }

    if not no_sgconfig:
        if name != DEFAULT_ENGINE and not set_default:
            config_patch = {"remotes": {name: conn_params}}
        else:
            config_patch = conn_params

        config_path = patch_and_save_config(CONFIG, config_patch)
    else:
        config_path = CONFIG["SG_CONFIG_FILE"]

    if not no_init:
        engine = PostgresEngine(name=name, conn_params=conn_params)
        engine.initialize()
        engine.commit()
        click.echo("Engine initialized successfully.")

    inject_config_into_engines(CONFIG["SG_ENGINE_PREFIX"], config_path)
    click.echo("Done.")
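
For reference, the shape of the config patch written above: when a non-default engine name is given and `set_default` isn't passed, the connection parameters are nested under `remotes`, otherwise they are applied at the top level (values illustrative):

```
# Named, non-default engine: parameters are nested under "remotes".
config_patch = {
    "remotes": {
        "benchmark": {
            "SG_ENGINE_HOST": "localhost",
            "SG_ENGINE_PORT": "5433",
            "SG_ENGINE_USER": "sgr",
            "SG_ENGINE_PWD": "supersecure",
            # ... remaining SG_ENGINE_* keys from conn_params
        }
    }
}

# Default engine (or set_default passed): the parameters are applied at the top level.
config_patch = conn_params
```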
Example #7
def test_commandline_engine_creation_list_stop_deletion(teardown_test_engine):
    runner = CliRunner()
    client = docker.from_env()

    # Create an engine with default password and wait for it to initialize
    result = runner.invoke(
        add_engine_c,
        [
            "--image",
            _get_test_engine_image(),
            "--no-pull",
            "--port",
            "5428",
            "--username",
            "not_sgr",
            "--no-sgconfig",
            TEST_ENGINE_NAME,
        ],
        input="notsosecure\nnotsosecure\n",
    )
    assert result.exit_code == 0

    # Connect to the engine to check that it's up
    conn_params = {
        "SG_ENGINE_HOST": "localhost",
        "SG_ENGINE_PORT": "5428",
        "SG_ENGINE_USER": "not_sgr",
        "SG_ENGINE_PWD": "notsosecure",
        "SG_ENGINE_DB_NAME": "splitgraph",
        "SG_ENGINE_POSTGRES_DB_NAME": "postgres",
        "SG_ENGINE_ADMIN_USER": "not_sgr",
        "SG_ENGINE_ADMIN_PWD": "notsosecure",
    }

    engine = PostgresEngine(name="test", conn_params=conn_params)
    assert engine.run_sql("SELECT * FROM splitgraph_meta.images") == []
    engine.close()

    # List running engines
    result = runner.invoke(list_engines_c)
    assert result.exit_code == 0
    assert TEST_ENGINE_NAME in result.stdout
    assert "running" in result.stdout

    # Check engine version
    # (we didn't put it into the .sgconfig so have to patch instead)
    with patch("splitgraph.engine.get_engine", return_value=engine):
        result = runner.invoke(version_engine_c, [TEST_ENGINE_NAME])
        assert result.exit_code == 0
        assert __version__ in result.stdout

    # Get engine logs (no --follow since we won't be able to interrupt it)
    result = runner.invoke(log_engine_c, [TEST_ENGINE_NAME])
    assert "database system is ready to accept connections" in result.stdout

    # Try deleting the engine while it's still running
    with pytest.raises(docker.errors.APIError):
        runner.invoke(delete_engine_c, ["-y", TEST_ENGINE_NAME],
                      catch_exceptions=False)

    # Stop the engine
    result = runner.invoke(stop_engine_c, [TEST_ENGINE_NAME])
    assert result.exit_code == 0

    # Check it's not running
    for c in client.containers.list(filters={"ancestor": "splitgraph/engine"},
                                    all=False):
        assert c.name != "splitgraph_test_engine_" + TEST_ENGINE_NAME

    result = runner.invoke(list_engines_c)
    assert TEST_ENGINE_NAME not in result.stdout

    result = runner.invoke(list_engines_c, ["-a"])
    assert TEST_ENGINE_NAME in result.stdout

    # Bring it back up
    result = runner.invoke(start_engine_c, [TEST_ENGINE_NAME])
    assert result.exit_code == 0

    # Check it's running
    result = runner.invoke(list_engines_c)
    assert result.exit_code == 0
    assert TEST_ENGINE_NAME in result.stdout
    assert "running" in result.stdout

    # Try upgrading it to the same engine version as a smoke test
    with patch("splitgraph.engine.get_engine", return_value=engine):
        # Make sure the connection is closed as the client will use this Engine reference
        # after the upgrade to initialize it.
        engine.close()

        result = runner.invoke(
            upgrade_engine_c,
            [
                TEST_ENGINE_NAME, "--image",
                _get_test_engine_image(), "--no-pull"
            ],
            catch_exceptions=False,
        )
        assert result.exit_code == 0
        assert "Upgraded engine %s to %s" % (TEST_ENGINE_NAME,
                                             __version__) in result.stdout

        # Check the engine is running and has the right version
        result = runner.invoke(list_engines_c)
        assert result.exit_code == 0
        assert TEST_ENGINE_NAME in result.stdout
        assert "running" in result.stdout

        result = runner.invoke(version_engine_c, [TEST_ENGINE_NAME])
        assert result.exit_code == 0
        assert __version__ in result.stdout

    # Force delete it
    result = runner.invoke(delete_engine_c,
                           ["-f", "--with-volumes", TEST_ENGINE_NAME],
                           input="y\n")
    assert result.exit_code == 0

    # Check the engine (and the volumes) are gone
    for c in client.containers.list(filters={"ancestor": "splitgraph/engine"},
                                    all=False):
        assert c.name != "splitgraph_test_engine_" + TEST_ENGINE_NAME
    for v in client.volumes.list():
        assert not v.name.startswith("splitgraph_test_engine_" +
                                     TEST_ENGINE_NAME)
Example #8
def test_disjoint_table_lq_temp_table_deletion_doesnt_lock_up(pg_repo_local):
    # When Multicorn reads from the temporary table, it does so in the context of the
    # transaction it was called from. It can therefore hold a read lock on the temporary
    # table that it won't release until the scan is over, but the scan won't be over
    # until we've managed to delete the temporary table, leading to a deadlock.
    # Check that deleting the table in a separate thread works.

    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql(
        "INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    tables, callback, plan = fruits.query_indirect(
        columns=["fruit_id", "name"], quals=None)

    # Force a materialization and get the query that makes us read from the temporary table.
    last_table = list(tables)[-1]
    pg_repo_local.commit_engines()

    # Simulate Multicorn reading from the table by doing it from a different engine.
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine")
    query = _generate_select_query(engine, last_table, ["fruit_id", "name"])
    engine.run_sql(query)
    logging.info("Acquired read lock")

    # At this point, we're holding a lock on the table and callback will lock up, unless it's run
    # in a separate thread.
    callback(from_fdw=True)

    # Wait for the thread to actually start (a bit gauche, but without this the test gets flaky(ier))
    time.sleep(1)

    # We can still read from the table (the callback spawned a thread that is blocked trying to delete it).
    engine.run_sql(query)

    # Now drop the lock.
    logging.info("Dropping the lock")
    engine.rollback()

    # If this test assertion fails, make sure we're not left holding the lock.
    engine.autocommit = True
    with pytest.raises(ObjectNotFoundError):
        engine.run_sql(query)
Example #9
    def write(self, value_: Any, **kwargs: Any) -> Result:
        """
        Writes the result to a repository on Splitgraph.

        Args:
            - value_ (Any): the value to write; will then be stored as the `value` attribute
                of the returned `Result` instance
            - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

        Returns:
            - Result: a new `Result` with the `value`, `comment`, `table`, and `tag` attributes set
        """

        cfg = patch_config(create_config_dict(), self.env or dict())
        engine = PostgresEngine(name='SplitgraphResult', conn_params=cfg)
        engine.initialize()
        repo = Repository(namespace=self.namespace, repository=self.repo_name, engine=engine)

        assert isinstance(value_, pd.DataFrame)
        assert engine.connected

        if not repository_exists(repo) and self.auto_init_repo:
            self.logger.info("Creating repo {}/{}...".format(repo.namespace, repo.repository))
            repo.init()

        # TODO: Retrieve the repo from bedrock first

        new = self.format(**kwargs)
        new.value = value_

        self.logger.info("Starting to upload result to {}...".format(new.table))

        with self.atomic(engine):
            self.logger.info("checkout")
            img = repo.head
            img.checkout(force=True)

            self.logger.info("df to table")
            df_to_table(new.value, repository=repo, table=new.table, if_exists='replace')

            self.logger.info("commit")
            new_img = repo.commit(comment=new.comment, chunk_size=10000)
            new_img.tag(new.tag)


        # if (repo.diff(new.table, img, new_img)):
        if self.auto_push:
            self.logger.info("push")
            repo.push(
                self.get_upstream(repo),
                handler="S3",
                overwrite_objects=True,
                overwrite_tags=True,
                reupload_objects=True,
            )

        engine.close()
        self.logger.info("Finished uploading result to {}...".format(new.table))

        return new
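
Stripped of the Prefect `Result` plumbing, the core ingestion flow used above can be sketched against a local engine as follows (repository and table names are illustrative; an existing pandas DataFrame `df` is assumed):

```
# Sketch: checkout -> df_to_table -> commit -> tag, against the default local engine.
from splitgraph.core.repository import Repository, repository_exists
from splitgraph.ingestion.pandas import df_to_table

repo = Repository("my_namespace", "my_repo")
if not repository_exists(repo):
    repo.init()

repo.head.checkout(force=True)
df_to_table(df, repository=repo, table="results", if_exists="replace")
image = repo.commit(comment="nightly results", chunk_size=10000)
image.tag("latest")
```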
Example #10
    def get_splitgraph_engine(self):
        # We assume the engine is initialized already. Initializing it here requires an
        # out-of-band admin connection that might mess with dbt's connection handling.
        return PostgresEngine(name="LOCAL",
                              pool=FakeConnectionPool(
                                  self.get_thread_connection().handle))
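
The same trick can be used outside dbt to wrap any already-open psycopg2 connection in an engine. A sketch, assuming `FakeConnectionPool` is importable from the same module as `PostgresEngine` and simply hands back the single connection it was constructed with:

```
# Sketch: reuse an existing psycopg2 connection as a Splitgraph engine.
import psycopg2
from splitgraph.engine.postgres.engine import PostgresEngine
# Assumed import location for FakeConnectionPool; adjust to wherever it is defined.
from splitgraph.engine.postgres.engine import FakeConnectionPool

conn = psycopg2.connect("dbname=splitgraph user=sgr password=supersecure host=localhost port=5432")
engine = PostgresEngine(name="LOCAL", pool=FakeConnectionPool(conn))
print(engine.run_sql("SELECT 1"))
```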