def sql_c(remote, show_all, json, query):
    """
    Run SQL on or connect to the Splitgraph Data Delivery Network.

    If a query isn't passed, this will return a libpq-compatible connection string
    to the registry's SQL endpoint. It can be used to connect to the endpoint with
    other SQL clients:

    ```
    pgcli $(sgr cloud sql)
    ```

    If a query is passed, this will run an SQL query against the SQL endpoint.
    """
    ddn_params = _get_ddn_conn_params(remote)
    from splitgraph.engine.postgres.engine import get_conn_str, PostgresEngine

    if not query:
        click.echo(get_conn_str(ddn_params))
        return

    # Build an engine to connect to the DDN, disable pre-flight API checks etc.
    engine = PostgresEngine(
        name=remote, conn_params=ddn_params, registry=False, check_version=False
    )
    try:
        results = engine.run_sql(query)
        emit_sql_results(results, use_json=json, show_all=show_all)
    finally:
        engine.close()

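# Usage sketch (not from the source): the same pattern outside of click -- build a
# throwaway engine against the DDN parameters and run a one-off query. `ddn_params`
# stands in for whatever _get_ddn_conn_params() returns for your remote.
from splitgraph.engine.postgres.engine import PostgresEngine

engine = PostgresEngine(
    name="data.splitgraph.com", conn_params=ddn_params, registry=False, check_version=False
)
try:
    print(engine.run_sql("SELECT 1"))
finally:
    engine.close()
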
def test_engine_autocommit(local_engine_empty):
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine", autocommit=True)

    repo = Repository("test", "repo", engine=engine)
    repo.init()
    repo.engine.rollback()

    # With autocommit on, init() was committed immediately, so the rollback is a
    # no-op and the repository is still visible from the other engine.
    assert repository_exists(Repository.from_template(repo, engine=local_engine_empty))

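# For contrast, a hedged sketch of what the test guards against (not part of the
# test suite): with the default autocommit=False, the same rollback would discard
# the uncommitted repo.init().
engine = PostgresEngine(conn_params=conn_params, name="test_engine", autocommit=False)
repo = Repository("test", "repo2", engine=engine)
repo.init()
repo.engine.rollback()  # init() ran inside an implicit transaction, so it's undone
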
def get_engine(
    name: Optional[str] = None,
    use_socket: bool = False,
    use_fdw_params: bool = False,
    autocommit: bool = False,
) -> "PostgresEngine":
    """
    Get the current global engine or a named remote engine

    :param name: Name of the remote engine as specified in the config. If None, the
        current global engine is returned.
    :param use_socket: Use a local UNIX socket instead of PG_HOST, PG_PORT for LOCAL
        engine connections.
    :param use_fdw_params: Use the _FDW connection parameters (SG_ENGINE_FDW_HOST/PORT).
        By default, will infer from the global splitgraph.config.IN_FDW flag.
    :param autocommit: If True, the engine will not open SQL transactions implicitly.
    """
    from .postgres.engine import PostgresEngine

    if not name:
        if isinstance(_ENGINE, PostgresEngine):
            return _ENGINE
        name = _ENGINE
    if name not in _ENGINES:
        # Here we'd get the engine type/backend (Postgres/MySQL etc)
        # and instantiate the actual Engine class.
        # As we only have PostgresEngine, we instantiate that.
        conn_params = cast(Dict[str, Optional[str]], _prepare_engine_config(CONFIG, name))

        try:
            is_registry = bool(conn_params.pop("SG_IS_REGISTRY"))
        except KeyError:
            is_registry = False
        try:
            check_version = bool(conn_params.pop("SG_CHECK_VERSION"))
        except KeyError:
            check_version = False

        if name == "LOCAL" and use_socket:
            conn_params["SG_ENGINE_HOST"] = None
            conn_params["SG_ENGINE_PORT"] = None

        if use_fdw_params:
            conn_params["SG_ENGINE_HOST"] = conn_params["SG_ENGINE_FDW_HOST"]
            conn_params["SG_ENGINE_PORT"] = conn_params["SG_ENGINE_FDW_PORT"]

        _ENGINES[name] = PostgresEngine(
            conn_params=conn_params,
            name=name,
            autocommit=autocommit,
            registry=is_registry,
            check_version=check_version,
            in_fdw=use_fdw_params and name == "LOCAL",
        )
    return _ENGINES[name]

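# Usage sketch: get_engine() returns the process-wide default engine, while a named
# lookup reads that remote's section of the config and caches the engine. The remote
# name "data.splitgraph.com" here is just an example.
engine = get_engine()                       # current global engine
remote = get_engine("data.splitgraph.com")  # instantiated from config, cached
assert get_engine("data.splitgraph.com") is remote  # later calls hit the _ENGINES cache
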
def _make_changeset(
    engine: PostgresEngine,
    old_schema: str,
    old_table: str,
    schema: str,
    table: str,
    schema_spec: TableSchema,
    upsert_condition: str = "TRUE",
) -> Changeset:
    """Build a fake changeset from the temporary table and the existing table to pass
    to the object manager (store as a Splitgraph diff)."""

    # PK -> (upserted / deleted, old row, new row)
    # As a memory-saving hack, we only record the values of the old row (read from the
    # current table) -- this is because object creation routines read the inserted rows
    # from the staging table anyway.

    change_key = [c for c, _ in get_change_key(schema_spec)]

    # Query:
    # SELECT (new PK columns) AS pk,
    #        (custom upsert condition),
    #        (row_to_json(old non-pk cols)) AS old_row
    # FROM new_table n LEFT OUTER JOIN old_table o ON [o.pk = n.pk]
    # WHERE old row != new row
    query = (
        SQL("SELECT ")
        + SQL(",").join(SQL("n.") + Identifier(c) for c in change_key)
        + SQL(",")
        + SQL(upsert_condition + " AS upserted, ")
        # If the PK doesn't exist in the old table, old_row is empty, else output it
        + SQL("CASE WHEN ")
        + SQL(" AND ").join(SQL("o.{0} IS NULL").format(Identifier(c)) for c in change_key)
        + SQL(" THEN '{}'::json ELSE json_build_object(")
        + SQL(",").join(
            SQL("%s, o.") + Identifier(c.name) for c in schema_spec if c.name not in change_key
        )
        + SQL(") END AS old_row FROM {}.{} n LEFT OUTER JOIN {}.{} o ON ").format(
            Identifier(schema),
            Identifier(table),
            Identifier(old_schema),
            Identifier(old_table),
        )
        + SQL(" AND ").join(SQL("o.{0} = n.{0}").format(Identifier(c)) for c in change_key)
        # NB: the leading space is needed so WHERE doesn't fuse with the last join condition
        + SQL(" WHERE o.* IS DISTINCT FROM n.*")
    ).as_string(engine.connection)
    args = [c.name for c in schema_spec if c.name not in change_key]
    result = engine.run_sql(query, args)
    return {tuple(row[:-2]): (row[-2], row[-1], {}) for row in result}

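# Hedged illustration of the Changeset this returns, for a table with PK (id) and a
# single non-PK column "name". The soft-delete upsert_condition (e.g. one checking an
# _sdc_deleted_at column) is an assumption, not something this function fixes.
example_changeset: Changeset = {
    (1,): (True, {"name": "old_name"}, {}),  # update: old values recorded for the index
    (2,): (True, {}, {}),                    # insert: PK absent from the old table
    (3,): (False, {"name": "stale"}, {}),    # row the upsert_condition marked as deleted
}
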
def _make_changeset(
    engine: PostgresEngine,
    schema: str,
    table: str,
    schema_spec: TableSchema,
    upsert_condition: str = "TRUE",
) -> Changeset:
    """Build a fake changeset from the temporary table and the existing table to pass
    to the object manager (store as a Splitgraph diff)."""

    # PK -> (upserted / deleted, old row, new row)
    # We don't find out the old row here. This is because it requires a JOIN on the
    # current Splitgraph table, so if we're adding e.g. 100k rows to a 1M row table,
    # it's going to cause big performance issues. Instead, we pretend that all rows
    # have been inserted (apart from the ones that have been deleted by having the
    # magic _sdc_deleted_at column).

    # We also don't care about finding out the new row here, as the storage routine
    # queries the table directly to get those values.

    # The tradeoff is that now, when querying the table, we need to include not only
    # fragments whose index matches the query, but also all fragments that might
    # overwrite those fragments (through PK range overlap). Since we don't record old
    # row values in this changeset's index, we can no longer find out whether a
    # fragment deletes some row by inspecting the index -- we need to use PK ranges
    # to find overlapping fragments.

    change_key = [c for c, _ in get_change_key(schema_spec)]

    # Query:
    # SELECT (col_1, col_2, ...) AS pk,
    #        (custom upsert condition) AS upserted
    # FROM new_table n
    # (old_row / new_row are left as empty dicts on the Python side)
    query = (
        SQL("SELECT ")
        + SQL(",").join(SQL("n.") + Identifier(c) for c in change_key)
        + SQL(",")
        + SQL(upsert_condition + " AS upserted FROM {}.{} n").format(
            Identifier(schema), Identifier(table)
        )
    ).as_string(engine.connection)
    result = engine.run_sql(query)
    return {tuple(row[:-1]): (row[-1], {}, {}) for row in result}

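# Hedged illustration of this variant's output: no old rows are ever recorded, so
# deletions are only distinguishable via the upsert_condition (e.g. the Singer
# _sdc_deleted_at check mentioned above).
example_changeset: Changeset = {
    (1,): (True, {}, {}),   # treated as an insert/upsert
    (2,): (False, {}, {}),  # soft-deleted via _sdc_deleted_at
}
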
def add_engine_c(
    image, port, username, no_init, no_sgconfig, inject_source, no_pull, name, password, set_default
):
    """
    Create and start a Splitgraph engine.

    This will pull the Splitgraph engine image, start it, create a Postgres user and
    initialize the engine.

    This also creates Docker volumes required to persist data/metadata.

    The engine Docker container by default will be named `splitgraph_engine_default`
    and its data and metadata volumes will have names `splitgraph_engine_default_data`
    and `splitgraph_engine_default_metadata`.
    """
    from splitgraph.engine.postgres.engine import PostgresEngine
    from splitgraph.config import CONFIG
    from docker.types import Mount

    client = get_docker_client()

    if not no_pull:
        click.echo("Pulling image %s..." % image)
        _pretty_pull(client, image)

    container_name = _get_container_name(name)
    data_name = _get_data_volume_name(name)
    metadata_name = _get_metadata_volume_name(name)

    # Set up required mounts for data/metadata
    data_volume = Mount(target="/var/lib/splitgraph/objects", source=data_name, type="volume")
    metadata_volume = Mount(target="/var/lib/postgresql/data", source=metadata_name, type="volume")
    mounts = [data_volume, metadata_volume]

    click.echo("Creating container %s." % container_name)
    click.echo("Data volume: %s." % data_name)
    click.echo("Metadata volume: %s." % metadata_name)

    if inject_source:
        source_path = _convert_source_path(
            os.getenv(
                "SG_SOURCE_ROOT", os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
            )
        )
        source_volume = Mount(target="/splitgraph/splitgraph", source=source_path, type="bind")
        mounts.append(source_volume)
        click.echo("Source path: %s" % source_path)

    container = client.containers.run(
        image=image,
        detach=True,
        name=container_name,
        ports={"5432/tcp": port},
        mounts=mounts,
        environment={
            "POSTGRES_USER": username,
            "POSTGRES_PASSWORD": password,
            "POSTGRES_DB": "splitgraph",
            # Actual config to be injected later
            "SG_CONFIG_FILE": "/.sgconfig",
        },
    )

    click.echo("Container created, ID %s" % container.short_id)

    # Extract the host that we can reach the container on
    # (might be different from localhost if docker-machine is used)
    hostname = urlparse(client.api.base_url).hostname

    conn_params: Dict[str, str] = {
        "SG_ENGINE_HOST": hostname,
        "SG_ENGINE_PORT": str(port),
        # Even if the engine is exposed on a different port on the host, we need to
        # make sure that it uses the default 5432 port to connect to itself.
        "SG_ENGINE_FDW_HOST": "localhost",
        "SG_ENGINE_FDW_PORT": "5432",
        "SG_ENGINE_USER": username,
        "SG_ENGINE_PWD": password,
        "SG_ENGINE_DB_NAME": "splitgraph",
        "SG_ENGINE_POSTGRES_DB_NAME": "postgres",
        "SG_ENGINE_ADMIN_USER": username,
        "SG_ENGINE_ADMIN_PWD": password,
    }

    if not no_sgconfig:
        if name != DEFAULT_ENGINE and not set_default:
            config_patch = {"remotes": {name: conn_params}}
        else:
            config_patch = conn_params
        config_path = patch_and_save_config(CONFIG, config_patch)
    else:
        config_path = CONFIG["SG_CONFIG_FILE"]

    if not no_init:
        engine = PostgresEngine(name=name, conn_params=conn_params)
        engine.initialize()
        engine.commit()
        click.echo("Engine initialized successfully.")

    inject_config_into_engines(CONFIG["SG_ENGINE_PREFIX"], config_path)
    click.echo("Done.")

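# Sketch of the config patch this command saves for a non-default engine name
# (without --set-default): the parameters land under a "remotes" section rather
# than at the top level of .sgconfig. Values here are illustrative.
example_patch = {
    "remotes": {
        "myengine": {
            "SG_ENGINE_HOST": "localhost",
            "SG_ENGINE_PORT": "5433",
            "SG_ENGINE_USER": "sgr",
            # ...remaining SG_ENGINE_* keys as built above
        }
    }
}
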
def test_commandline_engine_creation_list_stop_deletion(teardown_test_engine):
    runner = CliRunner()
    client = docker.from_env()

    # Create an engine with default password and wait for it to initialize
    result = runner.invoke(
        add_engine_c,
        [
            "--image",
            _get_test_engine_image(),
            "--no-pull",
            "--port",
            "5428",
            "--username",
            "not_sgr",
            "--no-sgconfig",
            TEST_ENGINE_NAME,
        ],
        input="notsosecure\nnotsosecure\n",
    )
    assert result.exit_code == 0

    # Connect to the engine to check that it's up
    conn_params = {
        "SG_ENGINE_HOST": "localhost",
        "SG_ENGINE_PORT": "5428",
        "SG_ENGINE_USER": "not_sgr",
        "SG_ENGINE_PWD": "notsosecure",
        "SG_ENGINE_DB_NAME": "splitgraph",
        "SG_ENGINE_POSTGRES_DB_NAME": "postgres",
        "SG_ENGINE_ADMIN_USER": "not_sgr",
        "SG_ENGINE_ADMIN_PWD": "notsosecure",
    }
    engine = PostgresEngine(name="test", conn_params=conn_params)
    assert engine.run_sql("SELECT * FROM splitgraph_meta.images") == []
    engine.close()

    # List running engines
    result = runner.invoke(list_engines_c)
    assert result.exit_code == 0
    assert TEST_ENGINE_NAME in result.stdout
    assert "running" in result.stdout

    # Check engine version
    # (we didn't put it into the .sgconfig so we have to patch instead)
    with patch("splitgraph.engine.get_engine", return_value=engine):
        result = runner.invoke(version_engine_c, [TEST_ENGINE_NAME])
        assert result.exit_code == 0
        assert __version__ in result.stdout

    # Get engine logs (no --follow since we won't be able to interrupt it)
    result = runner.invoke(log_engine_c, [TEST_ENGINE_NAME])
    assert "database system is ready to accept connections" in result.stdout

    # Try deleting the engine while it's still running
    with pytest.raises(docker.errors.APIError):
        runner.invoke(delete_engine_c, ["-y", TEST_ENGINE_NAME], catch_exceptions=False)

    # Stop the engine
    result = runner.invoke(stop_engine_c, [TEST_ENGINE_NAME])
    assert result.exit_code == 0

    # Check it's not running
    for c in client.containers.list(filters={"ancestor": "splitgraph/engine"}, all=False):
        assert c.name != "splitgraph_test_engine_" + TEST_ENGINE_NAME
    result = runner.invoke(list_engines_c)
    assert TEST_ENGINE_NAME not in result.stdout
    result = runner.invoke(list_engines_c, ["-a"])
    assert TEST_ENGINE_NAME in result.stdout

    # Bring it back up
    result = runner.invoke(start_engine_c, [TEST_ENGINE_NAME])
    assert result.exit_code == 0

    # Check it's running
    result = runner.invoke(list_engines_c)
    assert result.exit_code == 0
    assert TEST_ENGINE_NAME in result.stdout
    assert "running" in result.stdout

    # Try upgrading it to the same engine version as a smoke test
    with patch("splitgraph.engine.get_engine", return_value=engine):
        # Make sure the connection is closed as the client will use this Engine reference
        # after the upgrade to initialize it.
        engine.close()
        result = runner.invoke(
            upgrade_engine_c,
            [TEST_ENGINE_NAME, "--image", _get_test_engine_image(), "--no-pull"],
            catch_exceptions=False,
        )
        assert result.exit_code == 0
        assert "Upgraded engine %s to %s" % (TEST_ENGINE_NAME, __version__) in result.stdout

    # Check the engine is running and has the right version
    result = runner.invoke(list_engines_c)
    assert result.exit_code == 0
    assert TEST_ENGINE_NAME in result.stdout
    assert "running" in result.stdout

    result = runner.invoke(version_engine_c, [TEST_ENGINE_NAME])
    assert result.exit_code == 0
    assert __version__ in result.stdout

    # Force delete it
    result = runner.invoke(
        delete_engine_c, ["-f", "--with-volumes", TEST_ENGINE_NAME], input="y\n"
    )
    assert result.exit_code == 0

    # Check the engine (and the volumes) are gone
    for c in client.containers.list(filters={"ancestor": "splitgraph/engine"}, all=False):
        assert c.name != "splitgraph_test_engine_" + TEST_ENGINE_NAME
    for v in client.volumes.list():
        assert not v.name.startswith("splitgraph_test_engine_" + TEST_ENGINE_NAME)

def test_disjoint_table_lq_temp_table_deletion_doesnt_lock_up(pg_repo_local):
    # When Multicorn reads from the temporary table, it does that in the context of the
    # transaction that it's been called from. It hence can hold a read lock on the
    # temporary table that it won't release until the scan is over -- but the scan won't
    # be over until we've managed to delete the temporary table, leading to a deadlock.
    # Check that deleting the table in a separate thread works.
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    tables, callback, plan = fruits.query_indirect(columns=["fruit_id", "name"], quals=None)

    # Force a materialization and get the query that makes us read from the temporary table.
    last_table = list(tables)[-1]
    pg_repo_local.commit_engines()

    # Simulate Multicorn reading from the table by doing it from a different engine.
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine")
    query = _generate_select_query(engine, last_table, ["fruit_id", "name"])
    engine.run_sql(query)
    logging.info("Acquired read lock")

    # At this point, we're holding a lock on the table and the callback will lock up
    # unless it's run in a separate thread.
    callback(from_fdw=True)

    # Wait for the thread to actually start (a bit gauche, but otherwise it makes the
    # test flaky(-ier))
    time.sleep(1)

    # We can still read from the table (the callback spawned a thread that is blocked
    # trying to delete it).
    engine.run_sql(query)

    # Now drop the lock.
    logging.info("Dropping the lock")
    engine.rollback()

    # If this test assertion fails, make sure we're not left holding the lock.
    engine.autocommit = True

    with pytest.raises(ObjectNotFoundError):
        engine.run_sql(query)

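# The gist of what the test verifies, as a hedged sketch: dropping the materialized
# temporary table from a separate thread (with its own connection) makes the FDW
# scan's read lock merely block the drop instead of deadlocking the session. The
# schema/table names here are hypothetical.
import threading
from psycopg2.sql import SQL, Identifier

def _drop_temp_table(schema, table):
    # A separate engine means a separate connection, so this simply waits until the
    # reading transaction releases its lock.
    tmp_engine = PostgresEngine(conn_params=_prepare_engine_config(CONFIG), name="dropper")
    try:
        tmp_engine.run_sql(
            SQL("DROP TABLE IF EXISTS {}.{}").format(Identifier(schema), Identifier(table))
        )
        tmp_engine.commit()
    finally:
        tmp_engine.close()

threading.Thread(target=_drop_temp_table, args=("sg_tmp", "o_12345")).start()
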
def write(self, value_: Any, **kwargs: Any) -> Result:
    """
    Writes the result to a repository on Splitgraph.

    Args:
        - value_ (Any): the value to write; will then be stored as the `value` attribute
            of the returned `Result` instance
        - **kwargs (optional): if provided, will be used to format the `table`, `comment`
            and `tag`

    Returns:
        - Result: a new `Result` with `value`, `comment`, `table` and `tag` attributes
    """
    cfg = patch_config(create_config_dict(), self.env or dict())
    engine = PostgresEngine(name="SplitgraphResult", conn_params=cfg)
    engine.initialize()
    repo = Repository(namespace=self.namespace, repository=self.repo_name, engine=engine)

    assert isinstance(value_, pd.DataFrame)
    assert engine.connected

    if not repository_exists(repo) and self.auto_init_repo:
        self.logger.info("Creating repo {}/{}...".format(repo.namespace, repo.repository))
        repo.init()

    # TODO: Retrieve the repo from bedrock first
    new = self.format(**kwargs)
    new.value = value_

    self.logger.info("Starting to upload result to {}...".format(new.table))

    with self.atomic(engine):
        self.logger.info("checkout")
        img = repo.head
        img.checkout(force=True)

        self.logger.info("df to table")
        df_to_table(new.value, repository=repo, table=new.table, if_exists="replace")

        self.logger.info("commit")
        new_img = repo.commit(comment=new.comment, chunk_size=10000)
        new_img.tag(new.tag)

    # if (repo.diff(new.table, img, new_img)):
    if self.auto_push:
        self.logger.info("push")
        repo.push(
            self.get_upstream(repo),
            handler="S3",
            overwrite_objects=True,
            overwrite_tags=True,
            reupload_objects=True,
        )

    engine.close()
    self.logger.info("Finished uploading result to {}...".format(new.table))

    return new

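# Hedged usage sketch: how a flow might store a DataFrame through this Result class.
# The class name SplitgraphResult and its constructor arguments are assumptions
# mirroring the attributes referenced above (namespace, repo_name, auto_push, and
# the templated table/comment/tag that self.format(**kwargs) fills in).
import pandas as pd

result = SplitgraphResult(
    namespace="myteam",
    repo_name="metrics",
    auto_push=False,
    table="daily_{date}",
    comment="run for {date}",
    tag="{date}",
)
stored = result.write(pd.DataFrame({"x": [1, 2, 3]}), date="2021-01-01")
print(stored.table, stored.tag)
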
def get_splitgraph_engine(self):
    # We assume the engine is initialized already. Initializing it here requires an
    # out-of-band admin connection that might mess with dbt's connection handling.
    return PostgresEngine(
        name="LOCAL", pool=FakeConnectionPool(self.get_thread_connection().handle)
    )

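# Hedged sketch of why the pool is faked: the engine reuses dbt's already-open
# connection instead of opening its own. This assumes FakeConnectionPool.getconn()
# simply hands back the wrapped psycopg2 connection, and `adapter` is a hypothetical
# instance of the class defining get_splitgraph_engine().
engine = adapter.get_splitgraph_engine()
engine.run_sql("SELECT 1")  # executes on dbt's own thread connection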