def test_import_splitfile_reuses_hash(local_engine_empty):
    """Build the same Splitfile into two repositories and compare their objects.

    The Splitfile loads some data from a mounted database. The same contents
    should result in the same object IDs in both outputs, i.e. no extra
    objects get created by the second build.
    """
    splitfile_name = "import_from_mounted_db.splitfile"
    second_output = Repository.from_schema("output_2")

    execute_commands(load_splitfile(splitfile_name), output=OUTPUT)
    execute_commands(load_splitfile(splitfile_name), output=second_output)

    # Object IDs expected for each table of the first build.
    expected_objects = {
        "my_fruits": ["o71ba35a5bbf8ac7779d8fe32226aaacc298773e154a4f84e9aabf829238fb1"],
        "o_vegetables": ["o70e726f4bf18547242722600c4723dceaaede27db8fa5e9e6d7ec39187dd86"],
        "vegetables": ["ob474d04a80c611fc043e8303517ac168444dc7518af60e4ccc56b3b0986470"],
        "all_fruits": ["o0e742bd2ea4927f5193a2c68f8d4c51ea018b1ef3e3005a50727147d2cf57b"],
    }

    first_head = OUTPUT.head
    for table_name, object_ids in expected_objects.items():
        assert first_head.get_table(table_name).objects == object_ids

    # The second build must point at exactly the same objects.
    second_head = second_output.head
    for table_name in expected_objects:
        assert (
            second_head.get_table(table_name).objects
            == first_head.get_table(table_name).objects
        )
def lookup_repository(name: str, include_local: bool = False) -> "Repository":
    """
    Finds an engine hosting the given repository by walking the lookup path.

    A name listed in the lookup path override is returned bound to the overriding
    engine without any existence check.

    :param name: Repository name
    :param include_local: If True, the local engine is queried as well (before the lookup path)
    :return: Local or remote Repository object
    :raises RepositoryNotFoundError: if none of the queried engines has the repository
    """
    from splitgraph.core.repository import Repository

    parsed = Repository.from_schema(name)

    if name in _LOOKUP_PATH_OVERRIDE:
        override_engine = get_engine(_LOOKUP_PATH_OVERRIDE[name])
        return Repository(parsed.namespace, parsed.repository, override_engine)

    # Currently just check if the schema with that name exists on the remote.
    if include_local and repository_exists(parsed):
        return parsed

    for engine_name in _LOOKUP_PATH:
        remote_repo = Repository(parsed.namespace, parsed.repository, get_engine(engine_name))
        if not repository_exists(remote_repo):
            # Not hosted on this engine: close it before trying the next one.
            remote_repo.engine.close()
            continue
        return remote_repo

    raise RepositoryNotFoundError("Unknown repository %s!" % name)
def build_c(splitfile, args, output_repository):
    """
    Build Splitgraph images.

    This executes a Splitfile, building a new image or checking it out from cache if the same
    image had already been built.

    Examples:

    ``sgr build my.splitfile``

    Executes ``my.splitfile`` and writes its output into a new repository with the same name as the
    Splitfile (my) unless the name is specified in the Splitfile.

    ``sgr build my.splitfile -o mynew/repo``

    Executes ``my.splitfile`` and writes its output into ``mynew/repo``.

    ``sgr build my_other.splitfile -o mynew/otherrepo --args PARAM1 VAL1 --args PARAM2 VAL2``

    Executes ``my_other.splitfile`` with parameters ``PARAM1`` and ``PARAM2`` set to ``VAL1`` and
    ``VAL2``, respectively.
    """
    # NOTE(review): the docstring above is presumably rendered as the command's help text,
    # so its wording is kept as-is -- confirm against the command decorator.
    from splitgraph.splitfile import execute_commands
    from splitgraph.core.repository import Repository

    # ``args`` is iterated as (name, value) pairs and turned into the parameter dictionary.
    splitfile_params = {name: value for name, value in args}
    click.echo("Executing Splitfile %s with arguments %r" % (splitfile.name, splitfile_params))

    target = output_repository
    if target is None:
        # Default output: the Splitfile's file name without its directory and extension.
        base_name = os.path.basename(splitfile.name)
        stem, _extension = os.path.splitext(base_name)
        target = Repository.from_schema(stem)

    execute_commands(splitfile.read(), splitfile_params, output=target)
def _execute_db_import(
    conn_string, fdw_name, fdw_params, table_names, target_mountpoint, table_aliases, table_queries
) -> ProvenanceLine:
    """
    Mounts a foreign database into a staging repository and imports tables from it.

    :param conn_string: Object exposing ``group()`` that yields the connection string, or a
        falsy value if there isn't one
    :param fdw_name: Name used to look up the mount handler and to name the staging repository
    :param fdw_params: JSON string with keyword arguments for the mount handler
    :param table_names: Passed through to ``import_tables``
    :param target_mountpoint: Repository whose ``import_tables`` receives the data
    :param table_aliases: Passed through to ``import_tables``
    :param table_queries: Passed through to ``import_tables``
    :return: Provenance entry ``{"type": "MOUNT"}``
    """
    mount_handler = get_mount_handler(fdw_name)
    staging_repo = Repository.from_schema(fdw_name + "_tmp_staging")

    # Delete the staging repository up front (presumably to clear leftovers of an earlier run).
    staging_repo.delete()
    try:
        mount_options = json.loads(fdw_params)
        raw_conn_string = conn_string.group() if conn_string else None
        mount_options.update(conn_string_to_dict(raw_conn_string))
        mount_handler(staging_repo.to_schema(), **mount_options)

        # The foreign database is a moving target, so the new image hash is random.
        # Maybe in the future, when the object hash is a function of its contents, we can be
        # smarter here...
        random_hash = format(getrandbits(256), "064x")
        target_mountpoint.import_tables(
            table_aliases,
            staging_repo,
            table_names,
            target_hash=random_hash,
            foreign_tables=True,
            table_queries=table_queries,
        )
        return {"type": "MOUNT"}
    finally:
        # The staging repository is deleted whether or not the import succeeded.
        staging_repo.delete()
def test_singer_ingestion_errors(local_engine_empty):
    """Check what happens to the images when a Singer ingestion fails midway.

    After a successful initial load, a stream with a wrong schema is ingested
    twice: once with the default failure strategy and once with
    ``--failure=keep-both``.
    """
    image_spec = TEST_REPO + ":latest"
    initial_path = os.path.join(INGESTION_RESOURCES, "singer/initial.json")
    wrong_schema_path = os.path.join(INGESTION_RESOURCES, "singer/wrong_schema.json")
    runner = CliRunner(mix_stderr=False)

    with open(initial_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=False)
    assert result.exit_code == 0

    # Default strategy: delete image on failure
    with open(wrong_schema_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=True)
    assert result.exit_code == 1
    assert isinstance(result.exception, psycopg2.errors.InvalidDatetimeFormat)
    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 1

    # Keep new image
    with open(wrong_schema_path, "r") as f:
        result = runner.invoke(
            singer_target,
            [image_spec, "--failure=keep-both"],
            input=f,
            catch_exceptions=True,
        )
    assert result.exit_code == 1
    assert isinstance(result.exception, psycopg2.errors.InvalidDatetimeFormat)
    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2

    # The "stargazers" table is still the same but the "releases" table managed to get updated.
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["releases", "stargazers"]
    image.checkout()
    releases_count = repo.run_sql(
        "SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE
    )
    assert releases_count == 7
    stargazers_count = repo.run_sql(
        "SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE
    )
    assert stargazers_count == 5
def test_push_target(
    repository,
    remote_repository,
    remote,
    available_remotes,
    upstream,
    expected_target,
    expected_remote,
):
    """Check which repository/engine ``_determine_push_target`` resolves to.

    ``expected_target`` is either an exception class (the call must raise it)
    or the schema string the resolved repository should have.
    """
    # Test parameters arrive as schema strings; turn them into Repository objects.
    repository = Repository.from_schema(repository)
    remote_repository = Repository.from_schema(remote_repository) if remote_repository else None

    # Minimal stand-in for the config: every available remote gets the same namespace.
    fake_config = {"remotes": {s: {"SG_NAMESPACE": "user"} for s in available_remotes}}

    # Patch the ``upstream`` property so it returns the parametrized value.
    with mock.patch.object(Repository, "upstream", new_callable=PropertyMock) as up:
        up.return_value = upstream
        with mock.patch("splitgraph.commandline.push_pull.REMOTES", available_remotes):
            with mock.patch("splitgraph.commandline.push_pull.CONFIG", fake_config):
                with mock.patch("splitgraph.engine.get_engine") as ge:
                    # The mocked get_engine hands back an engine named like the expected remote.
                    ge.return_value = Mock()
                    ge.return_value.name = expected_remote

                    if isinstance(expected_target, type):
                        # An exception class was passed in: the call has to raise it.
                        with pytest.raises(expected_target):
                            _determine_push_target(repository, remote_repository, remote)
                    else:
                        result = _determine_push_target(repository, remote_repository, remote)
                        if upstream:
                            assert result == upstream
                        else:
                            assert result.to_schema() == expected_target
                            # First positional argument of the first get_engine call.
                            ge_call = ge.mock_calls[0]
                            assert ge_call[1][0] == expected_remote
                            assert result.engine.name == expected_remote
def ingestion_test_repo():
    """Yield a freshly initialized ``test/ingestion`` repository and delete it afterwards."""
    test_repo = Repository.from_schema("test/ingestion")
    try:
        # Recreate the repository from scratch: delete, clean up objects, initialize.
        test_repo.delete()
        test_repo.objects.cleanup()
        test_repo.init()
        yield test_repo
    finally:
        # Runs even if setup or the consumer failed: roll back, then delete.
        test_repo.rollback_engines()
        test_repo.delete()
def test_singer_ingestion_initial(local_engine_empty):
    """Ingest the initial Singer stream into a new repository and verify the image.

    Initial ingestion: two tables (stargazers and releases) grabbed from the
    output of tap-github, truncated (simulates table creation and insertion of
    some rows).
    """
    initial_path = os.path.join(INGESTION_RESOURCES, "singer/initial.json")
    bookmark = "2020-10-14T11:06:40.852311Z"
    expected_state = {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {"since": bookmark},
                "releases": {"since": bookmark},
            }
        }
    }
    expected_stargazers = [
        (Decimal("100001"), datetime(2018, 10, 17, 22, 14, 12)),
        (Decimal("100002"), datetime(2018, 11, 6, 11, 26, 16)),
        (Decimal("100003"), datetime(2018, 12, 11, 16, 0, 42)),
        (Decimal("100004"), datetime(2019, 2, 18, 8, 14, 21)),
        (Decimal("100005"), datetime(2019, 4, 18, 2, 40, 47)),
    ]

    runner = CliRunner(mix_stderr=False)
    with open(initial_path, "r") as f:
        result = runner.invoke(singer_target, [TEST_REPO], input=f, catch_exceptions=False)
    assert result.exit_code == 0
    assert json.loads(result.stdout) == expected_state

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["releases", "stargazers"]
    image.checkout()

    releases_count = repo.run_sql(
        "SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE
    )
    assert releases_count == 6
    stargazers_count = repo.run_sql(
        "SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE
    )
    assert stargazers_count == 5
    stargazers = repo.run_sql("SELECT user_id, starred_at FROM stargazers ORDER BY user_id")
    assert stargazers == expected_stargazers

    # Each table is stored as a single object with the expected schema.
    assert image.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert image.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a"
    ]
    assert image.get_table("stargazers").table_schema == _STARGAZERS_SCHEMA
    assert image.get_table("stargazers").objects == [
        "od68e932ebc99c1a337363c1b92056dcf7fc7c6c45494bc42e1e1ec4e0c88ac"
    ]
def convert(
    self, value: str, param: Optional[Parameter], ctx: Optional[Context]
) -> "Repository":
    """
    Turns a repository name into a Repository object.

    :param value: Repository name as given by the caller
    :param param: Not used by this method
    :param ctx: Not used by this method
    :return: Repository parsed from ``value``
    :raises RepositoryNotFoundError: if ``self.exists`` is set and the repository doesn't exist
    """
    from splitgraph.core.repository import Repository

    repository = Repository.from_schema(value)
    if not self.exists:
        # No existence check requested.
        return repository

    from splitgraph.core.engine import repository_exists

    if repository_exists(repository):
        return repository
    raise RepositoryNotFoundError("Unknown repository %s" % repository)
def parse_repo_tag_or_hash(value, default="latest"):
    """
    Splits a ``repository[:tag_or_hash]`` string into a Repository and a tag/hash.

    :param value: String to parse
    :param default: Tag/hash to use unless ``value`` has exactly two colon-separated parts
    :return: Tuple of (Repository built from the first part, tag or hash)
    """
    parts = value.split(":")
    # Anything other than exactly "repo:tag" falls back to the default tag.
    tag_or_hash: Optional[str] = parts[1] if len(parts) == 2 else default

    from splitgraph.core.repository import Repository

    return Repository.from_schema(parts[0]), tag_or_hash
def _execute_from(node: Node, output: Repository) -> Tuple[Repository, Optional[ProvenanceLine]]:
    """
    Executes a FROM command: optionally switches the output repository (AS) and bases it on
    an image from a source repository.

    :param node: Parse node of the command; its ``repo_source`` and ``repository`` child nodes
        are inspected.
    :param output: Current output repository.
    :return: Tuple of (output repository -- replaced if AS was given, provenance entry
        describing the source image or None if there was no source).
    :raises SplitfileError: if the command has neither a source nor an AS repository.
    """
    interesting_nodes = extract_nodes(node, ["repo_source", "repository"])
    repo_source = get_first_or_none(interesting_nodes, "repo_source")
    output_node = get_first_or_none(interesting_nodes, "repository")
    provenance: Optional[ProvenanceLine] = None

    if output_node:
        # AS (output) detected, change the current output repository to it.
        output = Repository.from_schema(output_node.match.group(0))
        logging.info("Changed output repository to %s" % str(output))

        # NB this destroys all data in the case where we ran some commands in the Splitfile and then
        # did FROM (...) without AS repository
        if repository_exists(output):
            logging.info("Clearing all output from %s" % str(output))
            output.delete()

    # Make sure there is a repository to clone into / check out in.
    if not repository_exists(output):
        output.init()

    if repo_source:
        repository, tag_or_hash = parse_image_spec(repo_source)
        source_repo = lookup_repository(repository.to_schema(), include_local=True)

        if source_repo.engine.name == "LOCAL":
            # For local repositories, make sure to update them if they've an upstream
            if source_repo.upstream:
                source_repo.pull()

        # Get the target image hash from the source repo: otherwise, if the tag is, say, 'latest' and
        # the output has just had the base commit (000...) created in it, that commit will be the latest.
        clone(source_repo, local_repository=output, download_all=False)
        source_hash = source_repo.images[tag_or_hash].image_hash
        output.images.by_hash(source_hash).checkout()
        provenance = {
            "type": "FROM",
            "source_namespace": source_repo.namespace,
            "source": source_repo.repository,
            "source_hash": source_hash,
        }
    else:
        # FROM EMPTY AS repository -- initializes an empty repository (say to create a table or import
        # the results of a previous stage in a multistage build.
        # In this case, if AS repository has been specified, it's already been initialized. If not, this command
        # literally does nothing
        if not output_node:
            raise SplitfileError("FROM EMPTY without AS (repository) does nothing!")

    return output, provenance
def test_syncable_data_source(local_engine_empty):
    """Sync the test data source into a repository from several base images.

    Covers an initial sync on ``latest``, a plain load, a sync based on the
    all-zeros image and a sync based on the image from the first sync.
    """
    empty_image = "0" * 64
    synced_tables = ["_sg_ingestion_state", "test_table"]
    source = IngestionTestSource(engine=local_engine_empty, credentials={}, params={})

    # Initial sync
    repo = Repository.from_schema(TEST_REPO)
    repo.init()
    first_hash = source.sync(repo, "latest")
    assert len(repo.images()) == 2
    image = repo.images[first_hash]
    assert sorted(image.get_tables()) == synced_tables
    image.checkout()
    assert repo.run_sql("SELECT * FROM test_table") == [(1, "one")]
    assert _get_state(repo) == {"last_value": 1}

    # Load the data anew into a different image
    repo.images[empty_image].checkout()
    source._load(repo.to_schema())
    repo.commit_engines()
    assert repo.run_sql("SELECT * FROM test_table") == [(1, "one")]
    repo.uncheckout(force=True)

    # Perform a sync based on the empty image
    second_hash = source.sync(repo, empty_image)
    assert second_hash != first_hash
    image = repo.images[second_hash]
    assert sorted(image.get_tables()) == synced_tables
    image.checkout()
    assert repo.run_sql("SELECT * FROM test_table") == [(1, "one")]
    assert _get_state(repo) == {"last_value": 1}

    # Perform a sync based on the ingested image
    third_hash = source.sync(repo, first_hash)
    assert third_hash != first_hash
    image = repo.images[third_hash]
    assert sorted(image.get_tables()) == synced_tables
    image.checkout()
    rows = repo.run_sql("SELECT * FROM test_table ORDER BY key ASC")
    assert rows == [(1, "one"), (2, "two")]
    assert _get_state(repo) == {"last_value": 2}
def parse_image_spec(remote_repo_node: Node) -> Tuple["Repository", str]:
    """
    Turns an image specification parse node into a repository and a tag/hash
    (e.g. noaa/climate:abcdef123 -> Repository('noaa', 'climate'), 'abcdef123').

    :param remote_repo_node: Parse node with the specification
    :return: Tuple of (repository object, tag or hash); the tag is "latest" unless
        the node yields both a repository and a tag/hash
    """
    spec_nodes = extract_nodes(remote_repo_node, ["repository", "tag_or_hash"])

    # Avoid cyclic imports
    from splitgraph.core.repository import Repository

    repo_name = spec_nodes[0].match.group(0)
    repository = Repository.from_schema(repo_name)

    # See if we got given a tag / hash (the executor will try to interpret it as both).
    has_tag = len(spec_nodes) == 2
    tag_or_hash = spec_nodes[1].match.group(0) if has_tag else "latest"
    return repository, tag_or_hash
def test_examples(local_engine_empty):
    """Test the example-generating commands used in the quickstart."""
    repo_name = "example/repo_1"
    runner = CliRunner()

    # Generate the example repository.
    generated = runner.invoke(generate_c, [repo_name])
    assert generated.exit_code == 0
    repo = Repository.from_schema(repo_name)
    assert len(repo.images()) == 2
    row_count = repo.run_sql("SELECT COUNT(*) FROM demo", return_shape=ResultShape.ONE_ONE)
    assert row_count == 10
    assert repo.diff("demo", repo.head, None, aggregate=True) == (0, 0, 0)

    # Alter it: the image count stays the same but the diff against HEAD changes.
    altered = runner.invoke(alter_c, [repo_name])
    assert altered.exit_code == 0
    assert len(repo.images()) == 2
    assert repo.diff("demo", repo.head, None, aggregate=True) == (2, 2, 2)

    # Generate an example Splitfile that references two repositories.
    splitfile = runner.invoke(splitfile_c, [repo_name, "example/repo_2"])
    assert splitfile.exit_code == 0
    assert "FROM example/repo_1 IMPORT demo AS table_1" in splitfile.stdout
    assert "FROM example/repo_2:${IMAGE_2} IMPORT demo AS table_2" in splitfile.stdout
def test_singer_ingestion_delete_old_image(local_engine_empty):
    """Check that ``--delete-old`` leaves a single image after an update ingestion."""
    image_spec = TEST_REPO + ":latest"
    initial_path = os.path.join(INGESTION_RESOURCES, "singer/initial.json")
    update_path = os.path.join(INGESTION_RESOURCES, "singer/update.json")
    runner = CliRunner(mix_stderr=False)

    with open(initial_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=False)
    assert result.exit_code == 0

    with open(update_path, "r") as f:
        result = runner.invoke(
            singer_target,
            [image_spec, "--delete-old"],
            input=f,
            catch_exceptions=False,
        )
    assert result.exit_code == 0

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 1
def test_singer_tap_mysql_sync(local_engine_empty):
    """Sync the MySQL tap twice into the same repository and inspect the stored objects."""
    first_object = "o69e4529709af65f37f2e2f3a8290340ae7ad9ada6bca9c393a09572f12cbb3"
    expected_rows = [
        (datetime(2012, 11, 11, 8, 6, 26), True, 1, "portobello"),
        (datetime(2018, 3, 17, 8, 6, 26), False, 2, "deathcap"),
    ]
    expected_state = {
        "bookmarks": {
            "mysqlschema-mushrooms": {
                "replication_key": "mushroom_id",
                "replication_key_value": 2,
                "version": mock.ANY,
            }
        },
        "currently_syncing": None,
    }

    source = _source(local_engine_empty)
    repo = Repository.from_schema(TEST_REPO)

    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["_sg_ingestion_state", "mushrooms"]
    image.checkout()

    rows = repo.run_sql("SELECT * FROM mushrooms ORDER BY mushroom_id ASC")
    assert rows == expected_rows
    state = repo.run_sql("SELECT state FROM _sg_ingestion_state")[0][0]
    assert state == expected_state
    assert image.get_table("mushrooms").objects == [first_object]

    # Run replication one more time -- check that we didn't add any more rows
    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert image.get_table("mushrooms").objects == [
        first_object,
        # TODO: this object has the pk=2 row from the previous one repeated, a tap-mysql bug
        # but we don't conflate these with Singer now.
        "od487f26d32a347ae4cc81a7442ef5a28615f70a9fff426991ab0d9d14bf7aa",
    ]
def test_singer_tap_mysql_sync(local_engine_empty):
    """Sync the MySQL tap twice and check that the second run stores no new object."""
    mushrooms_objects = ["o69e4529709af65f37f2e2f3a8290340ae7ad9ada6bca9c393a09572f12cbb3"]
    expected_rows = [
        (datetime(2012, 11, 11, 8, 6, 26), True, 1, "portobello"),
        (datetime(2018, 3, 17, 8, 6, 26), False, 2, "deathcap"),
    ]
    expected_state = {
        "bookmarks": {
            "mysqlschema-mushrooms": {
                "replication_key": "mushroom_id",
                "replication_key_value": 2,
                "version": mock.ANY,
            }
        },
        "currently_syncing": None,
    }

    source = _source(local_engine_empty)
    repo = Repository.from_schema(TEST_REPO)

    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["_sg_ingestion_state", "mushrooms"]
    image.checkout()

    rows = repo.run_sql("SELECT * FROM mushrooms ORDER BY mushroom_id ASC")
    assert rows == expected_rows
    state = repo.run_sql("SELECT state FROM _sg_ingestion_state")[0][0]
    assert state == expected_state
    assert image.get_table("mushrooms").objects == mushrooms_objects

    # Run replication one more time -- check that we didn't add any more rows
    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert image.get_table("mushrooms").objects == mushrooms_objects
def execute_commands(
    commands: str,
    params: Optional[Dict[str, str]] = None,
    output: Optional[Repository] = None,
    output_base: Optional[str] = "0" * 32,
) -> None:
    """
    Executes a series of Splitfile commands.

    :param commands: A string with the raw Splitfile.
    :param params: A dictionary of parameters to be applied to the Splitfile (`${PARAM}` is replaced
        with the specified parameter value).
    :param output: Output repository to execute the Splitfile against. If not specified,
        a repository with a randomly generated ``output_...`` name is used.
    :param output_base: If not None, a revision that gets checked out for all Splitfile actions to be committed
        on top of it. Only applied when ``output`` is given and already exists.
    :raises Exception: Any error raised while running the commands is re-raised after the
        engine has been rolled back.
    """
    if params is None:
        params = {}
    if output and repository_exists(output) and output_base is not None:
        output.images.by_hash(output_base).checkout()
    # Use a random target schema if unspecified.
    output = output or Repository.from_schema("output_%0.2x" % getrandbits(16))

    # Don't initialize the output until a command writing to it asks us to
    # (otherwise we might have a FROM ... AS output_name change it).
    repo_created = False

    def _initialize_output(output):
        # Creates the output repository if it's missing and records that this run created it.
        if not repository_exists(output):
            nonlocal repo_created
            output.init()
            repo_created = True

    from splitgraph.commandline.common import Color

    node_list = parse_commands(commands, params=params)

    # Record the internal structure of commands used to create the final image.
    provenance: List[ProvenanceLine] = []

    try:
        for i, node in enumerate(node_list):
            logging.info(
                Color.BOLD
                + "\nStep %d/%d : %s" % (i + 1, len(node_list), truncate_line(node.text, length=60))
                + Color.END
            )
            if node.expr_name == "from":
                # FROM may replace the output repository and handles its initialization itself.
                output, maybe_provenance_line = _execute_from(node, output)
                if maybe_provenance_line:
                    provenance.append(maybe_provenance_line)

            elif node.expr_name == "import":
                _initialize_output(output)
                provenance_line = _execute_import(node, output)
                provenance.append(provenance_line)

            elif node.expr_name == "sql" or node.expr_name == "sql_file":
                _initialize_output(output)
                provenance_line = _execute_sql(node, output)
                provenance.append(provenance_line)

            elif node.expr_name == "custom":
                _initialize_output(output)
                provenance_line = _execute_custom(node, output)
                provenance.append(provenance_line)

        # Attach the collected provenance to the image the output is at after the last command.
        final_image = output.head_strict
        final_image.set_provenance(provenance)
        get_engine().commit()
        logging.info("Successfully built %s:%s." % (str(output), final_image.image_hash[:12]))

    except Exception:
        if repo_created and len(output.images()) == 1:
            # As a corner case, if we created a repository and there's been
            # a failure running the Splitfile (on the first command), we delete the dummy
            # 0000... image and the rest of the repository as part of cleanup.
            output.delete()
        # NOTE(review): the rollback is issued after the cleanup delete above -- confirm that
        # the delete is not undone by it.
        get_engine().rollback()
        raise
from datetime import datetime as dt

import pytest
from test.splitgraph.conftest import _mount_postgres, _mount_mysql, _mount_mongo

from splitgraph.core.repository import Repository
from splitgraph.core.types import TableColumn
from splitgraph.engine import get_engine
from splitgraph.hooks.mount_handlers import mount

# Repositories used as mountpoints by the tests in this module.
PG_MNT = Repository.from_schema("test/pg_mount")
MG_MNT = Repository.from_schema("test_mg_mount")
MYSQL_MNT = Repository.from_schema("test/mysql_mount")


@pytest.mark.mounting
def test_mount_unmount(local_engine_empty):
    """Mount a Postgres database, read from it, then delete the mountpoint."""
    _mount_postgres(PG_MNT)
    assert (1, "apple") in get_engine().run_sql("""SELECT * FROM "test/pg_mount".fruits""")
    PG_MNT.delete()
    # Deleting the repository must also remove its schema from the engine.
    assert not get_engine().schema_exists(PG_MNT.to_schema())


@pytest.mark.mounting
def test_mount_partial(local_engine_empty):
    """Mount only the ``fruits`` table and check that ``vegetables`` is absent."""
    _mount_postgres(PG_MNT, tables=["fruits"])
    assert get_engine().table_exists(PG_MNT.to_schema(), "fruits")
    assert not get_engine().table_exists(PG_MNT.to_schema(), "vegetables")


# Decorator of the next definition, which is not included in this chunk.
@pytest.mark.mounting
def test_singer_data_source_sync(local_engine_empty):
    """Sync a generic Singer data source twice into the same repository.

    After each sync the repository is expected to hold a single image with
    the two data tables plus the ingestion state table.
    """
    expected_tables = ["_sg_ingestion_state", "releases", "stargazers"]
    first_state = {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {"since": "2020-10-14T11:06:40.852311Z"},
                "releases": {"since": "2020-10-14T11:06:40.852311Z"},
            }
        }
    }
    second_state = {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "releases": {"since": "2020-10-14T11:06:42.786589Z"},
                "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
            }
        }
    }

    source = GenericSingerDataSource(
        local_engine_empty,
        credentials={"some": "credential"},
        params={"tap_path": TEST_TAP, "other": "param"},
    )
    repo = Repository.from_schema(TEST_REPO)

    # First sync
    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == expected_tables
    image.checkout()
    releases_count = repo.run_sql(
        "SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE
    )
    assert releases_count == 6
    stargazers_count = repo.run_sql(
        "SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE
    )
    assert stargazers_count == 5
    state = repo.run_sql("SELECT state FROM _sg_ingestion_state")[0][0]
    assert state == first_state

    # Second sync
    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == expected_tables
    image.checkout()
    releases_count = repo.run_sql(
        "SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE
    )
    assert releases_count == 9
    stargazers_count = repo.run_sql(
        "SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE
    )
    assert stargazers_count == 6
    state = repo.run_sql("SELECT state FROM _sg_ingestion_state")[0][0]
    assert state == second_state
def test_singer_ingestion_update(local_engine_empty):
    """Run the initial ingestion followed by an update and verify the stored diff.

    The repeat ingestion has a few rows getting updated (a record with the same
    PK but a different value should be picked up as a diff), a few inserted and
    one inserted that hasn't changed (it should not be saved in the diff).
    """
    image_spec = TEST_REPO + ":latest"
    initial_path = os.path.join(INGESTION_RESOURCES, "singer/initial.json")
    update_path = os.path.join(INGESTION_RESOURCES, "singer/update.json")
    diff_object = "oc61804b31dcae8294a6b780efe41601eaeb7a1d0b7cd7bdfea4843db214df0"
    expected_state = {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "releases": {"since": "2020-10-14T11:06:42.786589Z"},
                "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
            }
        }
    }
    expected_stargazers = [
        (Decimal("100001"), datetime(2018, 10, 17, 22, 14, 12)),
        (Decimal("100002"), datetime(2018, 11, 6, 11, 26, 16)),
        (Decimal("100003"), datetime(2018, 12, 11, 16, 0, 42)),
        (Decimal("100004"), datetime(2020, 10, 11, 21, 9, 30)),
        (Decimal("100005"), datetime(2019, 4, 18, 2, 40, 47)),
        (Decimal("100006"), datetime(2019, 6, 6, 20, 53)),
    ]

    runner = CliRunner(mix_stderr=False)
    with open(initial_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=False)
    assert result.exit_code == 0

    with open(update_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=False)
    assert result.exit_code == 0
    assert json.loads(result.stdout) == expected_state

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["releases", "stargazers"]
    image.checkout()

    releases_count = repo.run_sql(
        "SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE
    )
    assert releases_count == 9
    stargazers_count = repo.run_sql(
        "SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE
    )
    assert stargazers_count == 6
    stargazers = repo.run_sql("SELECT user_id, starred_at FROM stargazers ORDER BY user_id")
    assert stargazers == expected_stargazers

    assert image.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert image.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a",
        "ocf91fc59f89f9db3db9aea28c4719f8bd009b13990f3f12f93f282618d81a8",
    ]

    assert image.get_table("stargazers").table_schema == _STARGAZERS_SCHEMA
    # Extra DIFF at the end
    assert image.get_table("stargazers").objects == [
        "od68e932ebc99c1a337363c1b92056dcf7fc7c6c45494bc42e1e1ec4e0c88ac",
        diff_object,
    ]

    # The diff object holds only the changed row (100004) and the new one (100006).
    diff_rows = repo.run_sql(
        "SELECT sg_ud_flag, user_id, starred_at "
        "FROM splitgraph_meta.oc61804b31dcae8294a6b780efe41601eaeb7a1d0b7cd7bdfea4843db214df0 "
        "ORDER BY user_id"
    )
    assert diff_rows == [
        (True, Decimal("100004"), datetime(2020, 10, 11, 21, 9, 30)),
        (True, Decimal("100006"), datetime(2019, 6, 6, 20, 53)),
    ]
def test_singer_ingestion_schema_change(local_engine_empty):
    """Run the initial ingestion, then one where ``user_id`` in stargazers became a string."""
    image_spec = TEST_REPO + ":latest"
    initial_path = os.path.join(INGESTION_RESOURCES, "singer/initial.json")
    schema_change_path = os.path.join(INGESTION_RESOURCES, "singer/schema_change.json")
    expected_state = {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
            }
        }
    }
    expected_stargazers = [
        ("100001", datetime(2018, 10, 17, 22, 14, 12)),
        ("100002", datetime(2018, 11, 6, 11, 26, 16)),
        ("100003", datetime(2018, 12, 11, 16, 0, 42)),
        ("100004", datetime(2020, 10, 11, 21, 9, 30)),
        ("100005", datetime(2019, 4, 18, 2, 40, 47)),
        ("string_user_id", datetime(2019, 4, 18, 2, 40, 47)),
    ]
    migrated_schema = [
        TableColumn(
            ordinal=0,
            name="_sdc_repository",
            pg_type="character varying",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=1,
            name="starred_at",
            pg_type="timestamp without time zone",
            is_pk=False,
            comment=None,
        ),
        TableColumn(ordinal=2, name="user", pg_type="jsonb", is_pk=False, comment=None),
        TableColumn(
            ordinal=3,
            name="user_id",
            pg_type="character varying",
            is_pk=True,
            comment=None,
        ),
    ]

    runner = CliRunner(mix_stderr=False)
    with open(initial_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=False)
    assert result.exit_code == 0

    with open(schema_change_path, "r") as f:
        result = runner.invoke(singer_target, [image_spec], input=f, catch_exceptions=False)
    assert result.exit_code == 0
    assert json.loads(result.stdout) == expected_state

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["releases", "stargazers"]
    image.checkout()

    releases_count = repo.run_sql(
        "SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE
    )
    assert releases_count == 6
    stargazers_count = repo.run_sql(
        "SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE
    )
    assert stargazers_count == 6
    stargazers = repo.run_sql("SELECT user_id, starred_at FROM stargazers ORDER BY user_id")
    assert stargazers == expected_stargazers

    # Releases unchanged -- same table
    assert image.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert image.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a"
    ]

    assert image.get_table("stargazers").table_schema == migrated_schema
    # Stargazers: had a migration, new object
    assert image.get_table("stargazers").objects == [
        "o9e54958076c86d854ad21da17239daecaec839e84daee8ff9ca5dcecd84cdd"
    ]