コード例 #1
0
def test_import_splitfile_reuses_hash(local_engine_empty):
    """Run one Splitfile into two repositories and compare the resulting objects.

    The Splitfile loads some data from a mounted database. The same contents
    should result in the same object IDs, so the second repository is expected
    to reference exactly the objects of the first one.
    """
    second_output = Repository.from_schema("output_2")

    # Build the same Splitfile once per target repository.
    for target in (OUTPUT, second_output):
        execute_commands(
            load_splitfile("import_from_mounted_db.splitfile"), output=target)

    # Pinned object IDs for every table produced by the first build.
    expected_objects = {
        "my_fruits": [
            "o71ba35a5bbf8ac7779d8fe32226aaacc298773e154a4f84e9aabf829238fb1"
        ],
        "o_vegetables": [
            "o70e726f4bf18547242722600c4723dceaaede27db8fa5e9e6d7ec39187dd86"
        ],
        "vegetables": [
            "ob474d04a80c611fc043e8303517ac168444dc7518af60e4ccc56b3b0986470"
        ],
        "all_fruits": [
            "o0e742bd2ea4927f5193a2c68f8d4c51ea018b1ef3e3005a50727147d2cf57b"
        ],
    }

    first_head = OUTPUT.head
    for table_name, object_ids in expected_objects.items():
        assert first_head.get_table(table_name).objects == object_ids

    # The second build must point at the very same objects.
    second_head = second_output.head
    for table_name in expected_objects:
        assert second_head.get_table(
            table_name).objects == first_head.get_table(table_name).objects
コード例 #2
0
ファイル: engine.py プロジェクト: yanyu510/splitgraph
def lookup_repository(name: str, include_local: bool = False) -> "Repository":
    """
    Find an engine that hosts the given repository.

    The lookup order is: an explicit entry for ``name`` in the lookup path
    override, then (optionally) the local engine, then every engine on the
    lookup path in order. Engines of candidates that don't host the repository
    are closed before moving on.

    :param name: Repository name
    :param include_local: If True, also queries the local engine

    :return: Local or remote Repository object
    :raises RepositoryNotFoundError: if no queried engine has the repository
    """
    from splitgraph.core.repository import Repository

    template = Repository.from_schema(name)

    def _on_engine(engine_name):
        # Same namespace/repository as the template, bound to another engine.
        return Repository(template.namespace, template.repository, get_engine(engine_name))

    # An override short-circuits the search and is returned without an existence check.
    if name in _LOOKUP_PATH_OVERRIDE:
        return _on_engine(_LOOKUP_PATH_OVERRIDE[name])

    # Currently just check if the schema with that name exists on the engine.
    if include_local and repository_exists(template):
        return template

    for engine_name in _LOOKUP_PATH:
        remote_repo = _on_engine(engine_name)
        if not repository_exists(remote_repo):
            remote_repo.engine.close()
            continue
        return remote_repo

    raise RepositoryNotFoundError("Unknown repository %s!" % name)
コード例 #3
0
ファイル: splitfile.py プロジェクト: yanyu510/splitgraph
def build_c(splitfile, args, output_repository):
    """
    Build Splitgraph images.

    This executes a Splitfile, building a new image or checking it out from cache if the same
    image had already been built.

    Examples:

    ``sgr build my.splitfile``

    Executes ``my.splitfile`` and writes its output into a new repository with the same name
    as the Splitfile (my) unless the name is specified in the Splitfile.

    ``sgr build my.splitfile -o mynew/repo``

    Executes ``my.splitfile`` and writes its output into ``mynew/repo``.

    ``sgr build my_other.splitfile -o mynew/otherrepo --args PARAM1 VAL1 --args PARAM2 VAL2``

    Executes ``my_other.splitfile`` with parameters ``PARAM1`` and ``PARAM2`` set to
    ``VAL1`` and  ``VAL2``, respectively.
    """
    # NOTE(review): the docstring above is presumably rendered as CLI help text,
    # so it is kept verbatim -- confirm against the command decorators.
    from splitgraph.splitfile import execute_commands
    from splitgraph.core.repository import Repository

    # ``args`` is iterated as (name, value) pairs and folded into a mapping.
    args = {name: value for name, value in args}
    click.echo(
        "Executing Splitfile %s with arguments %r" % (splitfile.name, args))

    if output_repository is None:
        # Default the output repository to the Splitfile's base name, extension removed.
        base_name = os.path.basename(splitfile.name)
        stem, _extension = os.path.splitext(base_name)
        output_repository = Repository.from_schema(stem)

    execute_commands(splitfile.read(), args, output=output_repository)
コード例 #4
0
ファイル: execution.py プロジェクト: yanyu510/splitgraph
def _execute_db_import(conn_string, fdw_name, fdw_params, table_names,
                       target_mountpoint, table_aliases,
                       table_queries) -> ProvenanceLine:
    """
    Mount a foreign database into a temporary staging schema and import tables from it.

    :param conn_string: Match-like object whose ``group()`` is the connection string, or a falsy value
    :param fdw_name: Name used to look up the mount handler and to name the staging schema
    :param fdw_params: JSON-encoded keyword arguments for the mount handler
    :param table_names: Passed through to ``import_tables``
    :param target_mountpoint: Repository that receives the imported tables
    :param table_aliases: Passed through to ``import_tables``
    :param table_queries: Passed through to ``import_tables``
    :return: Provenance line of type ``MOUNT``
    """
    mount_handler = get_mount_handler(fdw_name)
    staging_repo = Repository.from_schema(fdw_name + "_tmp_staging")
    # Start from a clean slate in case a previous run left the staging schema behind.
    staging_repo.delete()
    try:
        mount_options = json.loads(fdw_params)
        connection_spec = conn_string.group() if conn_string else None
        mount_options.update(conn_string_to_dict(connection_spec))
        mount_handler(staging_repo.to_schema(), **mount_options)

        # The foreign database is a moving target, so the new image hash is random.
        # Maybe in the future, when the object hash is a function of its contents, we can be smarter here...
        random_image_hash = "{:064x}".format(getrandbits(256))
        target_mountpoint.import_tables(
            table_aliases,
            staging_repo,
            table_names,
            target_hash=random_image_hash,
            foreign_tables=True,
            table_queries=table_queries,
        )
        return {"type": "MOUNT"}
    finally:
        # The staging schema is removed whether or not the import succeeded.
        staging_repo.delete()
コード例 #5
0
def test_singer_ingestion_errors(local_engine_empty):
    """Check how many images remain after a failed Singer ingestion.

    A successful initial load is followed by two loads of a stream with a wrong
    schema: one with the default failure strategy and one with
    ``--failure=keep-both``.
    """
    cli = CliRunner(mix_stderr=False)

    def _ingest(resource, extra_args=(), catch_exceptions=True):
        # Feed one resource file into the Singer target for TEST_REPO:latest.
        with open(os.path.join(INGESTION_RESOURCES, resource), "r") as stream:
            return cli.invoke(
                singer_target,
                [TEST_REPO + ":latest", *extra_args],
                input=stream,
                catch_exceptions=catch_exceptions,
            )

    outcome = _ingest("singer/initial.json", catch_exceptions=False)
    assert outcome.exit_code == 0

    # Default strategy: delete image on failure
    outcome = _ingest("singer/wrong_schema.json")
    assert outcome.exit_code == 1
    assert isinstance(outcome.exception, psycopg2.errors.InvalidDatetimeFormat)
    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 1

    # Keep new image
    outcome = _ingest("singer/wrong_schema.json", extra_args=["--failure=keep-both"])
    assert outcome.exit_code == 1
    assert isinstance(outcome.exception, psycopg2.errors.InvalidDatetimeFormat)
    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2

    # The "stargazers" table is still the same but the "releases" table managed to get updated.
    latest = repo.images["latest"]
    assert sorted(latest.get_tables()) == ["releases", "stargazers"]
    latest.checkout()

    for query, expected_count in (
        ("SELECT COUNT(1) FROM releases", 7),
        ("SELECT COUNT(1) FROM stargazers", 5),
    ):
        assert repo.run_sql(
            query, return_shape=ResultShape.ONE_ONE) == expected_count
コード例 #6
0
def test_push_target(
    repository,
    remote_repository,
    remote,
    available_remotes,
    upstream,
    expected_target,
    expected_remote,
):
    """Check which repository ``_determine_push_target`` resolves as the push target.

    ``Repository.upstream``, the ``REMOTES`` and ``CONFIG`` names in
    ``splitgraph.commandline.push_pull`` and ``splitgraph.engine.get_engine``
    are patched for the duration of the call. When ``expected_target`` is a
    class, the call is expected to raise it instead of returning.
    """
    repository = Repository.from_schema(repository)
    if remote_repository:
        remote_repository = Repository.from_schema(remote_repository)
    else:
        remote_repository = None

    # Every available remote gets the same minimal configuration.
    fake_config = {
        "remotes": {
            remote_name: {"SG_NAMESPACE": "user"}
            for remote_name in available_remotes
        }
    }

    with mock.patch.object(
            Repository, "upstream", new_callable=PropertyMock) as upstream_mock:
        upstream_mock.return_value = upstream
        with mock.patch("splitgraph.commandline.push_pull.REMOTES", available_remotes), \
                mock.patch("splitgraph.commandline.push_pull.CONFIG", fake_config), \
                mock.patch("splitgraph.engine.get_engine") as get_engine_mock:
            get_engine_mock.return_value = Mock()
            get_engine_mock.return_value.name = expected_remote

            if isinstance(expected_target, type):
                # An exception class as the expectation means the call must fail.
                with pytest.raises(expected_target):
                    _determine_push_target(repository, remote_repository, remote)
            else:
                result = _determine_push_target(
                    repository, remote_repository, remote)
                if not upstream:
                    assert result.to_schema() == expected_target

                    # The first get_engine call must have asked for the expected remote.
                    first_call = get_engine_mock.mock_calls[0]
                    assert first_call[1][0] == expected_remote
                    assert result.engine.name == expected_remote
                else:
                    assert result == upstream
コード例 #7
0
def ingestion_test_repo():
    """Yield a freshly initialized ``test/ingestion`` repository and delete it afterwards.

    NOTE(review): presumably used as a pytest fixture; the decorator is not
    visible here -- confirm.
    """
    test_repo = Repository.from_schema("test/ingestion")
    try:
        # Drop any leftover repository, clean up objects, then start from an empty repository.
        test_repo.delete()
        test_repo.objects.cleanup()
        test_repo.init()
        yield test_repo
    finally:
        # Roll back the engines before removing the repository again.
        test_repo.rollback_engines()
        test_repo.delete()
コード例 #8
0
def test_singer_ingestion_initial(local_engine_empty):
    """Initial ingestion of two tables (stargazers and releases).

    The input was grabbed from the output of tap-github and truncated; it
    simulates table creation and insertion of some rows.
    """
    cli = CliRunner(mix_stderr=False)

    initial_path = os.path.join(INGESTION_RESOURCES, "singer/initial.json")
    with open(initial_path, "r") as stream:
        outcome = cli.invoke(
            singer_target, [TEST_REPO], input=stream, catch_exceptions=False)

    assert outcome.exit_code == 0

    # The target's stdout must be the JSON state with one bookmark per stream.
    expected_state = {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {"since": "2020-10-14T11:06:40.852311Z"},
                "releases": {"since": "2020-10-14T11:06:40.852311Z"},
            }
        }
    }
    assert json.loads(outcome.stdout) == expected_state

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 1
    latest = repo.images["latest"]
    assert sorted(latest.get_tables()) == ["releases", "stargazers"]
    latest.checkout()

    for query, expected_count in (
        ("SELECT COUNT(1) FROM releases", 6),
        ("SELECT COUNT(1) FROM stargazers", 5),
    ):
        assert repo.run_sql(
            query, return_shape=ResultShape.ONE_ONE) == expected_count

    expected_stargazers = [
        (Decimal("100001"), datetime(2018, 10, 17, 22, 14, 12)),
        (Decimal("100002"), datetime(2018, 11, 6, 11, 26, 16)),
        (Decimal("100003"), datetime(2018, 12, 11, 16, 0, 42)),
        (Decimal("100004"), datetime(2019, 2, 18, 8, 14, 21)),
        (Decimal("100005"), datetime(2019, 4, 18, 2, 40, 47)),
    ]
    assert repo.run_sql(
        "SELECT user_id, starred_at FROM stargazers ORDER BY user_id"
    ) == expected_stargazers

    # Each table has the expected schema and is stored as a single pinned object.
    for table_name, expected_schema, expected_object_ids in (
        (
            "releases",
            _RELEASES_SCHEMA,
            ["o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a"],
        ),
        (
            "stargazers",
            _STARGAZERS_SCHEMA,
            ["od68e932ebc99c1a337363c1b92056dcf7fc7c6c45494bc42e1e1ec4e0c88ac"],
        ),
    ):
        assert latest.get_table(table_name).table_schema == expected_schema
        assert latest.get_table(table_name).objects == expected_object_ids
    ]
コード例 #9
0
    def convert(self, value: str, param: Optional[Parameter],
                ctx: Optional[Context]) -> "Repository":
        """Turn a schema string into a Repository, optionally requiring it to exist.

        :param value: Repository schema string to parse
        :param param: Unused here
        :param ctx: Unused here
        :return: The parsed Repository
        :raises RepositoryNotFoundError: if ``self.exists`` is set and the repository doesn't exist
        """
        from splitgraph.core.repository import Repository

        repository = Repository.from_schema(value)
        if not self.exists:
            return repository

        # Only imported when the existence check is actually requested.
        from splitgraph.core.engine import repository_exists

        if repository_exists(repository):
            return repository
        raise RepositoryNotFoundError("Unknown repository %s" % repository)
コード例 #10
0
ファイル: output.py プロジェクト: dazzag24/splitgraph
def parse_repo_tag_or_hash(value, default="latest"):
    """Split ``repository[:tag_or_hash]`` into a Repository and a tag or hash.

    :param value: String to parse
    :param default: Tag or hash used unless ``value`` splits on ``:`` into exactly two parts
    :return: Tuple of (repository object, tag or hash)
    """
    pieces = value.split(":")
    tag_or_hash: Optional[str] = pieces[1] if len(pieces) == 2 else default

    from splitgraph.core.repository import Repository

    # The part before the first colon is always taken as the repository name.
    return Repository.from_schema(pieces[0]), tag_or_hash
コード例 #11
0
ファイル: execution.py プロジェクト: yanyu510/splitgraph
def _execute_from(
        node: Node,
        output: Repository) -> Tuple[Repository, Optional[ProvenanceLine]]:
    """
    Executes a FROM node: optionally switches the output repository and bases it on a source image.

    :param node: Parse node of the FROM command
    :param output: Current output repository
    :return: Tuple of (possibly replaced output repository, provenance line or None if there
        was no source image)
    :raises SplitfileError: if the command has neither a source image nor an AS (repository) part
    """
    interesting_nodes = extract_nodes(node, ["repo_source", "repository"])
    repo_source = get_first_or_none(interesting_nodes, "repo_source")
    output_node = get_first_or_none(interesting_nodes, "repository")
    provenance: Optional[ProvenanceLine] = None

    if output_node:
        # AS (output) detected, change the current output repository to it.
        output = Repository.from_schema(output_node.match.group(0))
        logging.info("Changed output repository to %s" % str(output))

        # NB this destroys all data in the case where we ran some commands in the Splitfile and then
        # did FROM (...) without AS repository
        if repository_exists(output):
            logging.info("Clearing all output from %s" % str(output))
            output.delete()
    # Make sure the output repository exists before anything is cloned or checked out into it.
    if not repository_exists(output):
        output.init()
    if repo_source:
        repository, tag_or_hash = parse_image_spec(repo_source)
        # The source can be on the local engine or on any engine on the lookup path.
        source_repo = lookup_repository(repository.to_schema(),
                                        include_local=True)

        if source_repo.engine.name == "LOCAL":
            # For local repositories, make sure to update them if they have an upstream
            if source_repo.upstream:
                source_repo.pull()

        # Get the target image hash from the source repo: otherwise, if the tag is, say, 'latest' and
        # the output has just had the base commit (000...) created in it, that commit will be the latest.
        clone(source_repo, local_repository=output, download_all=False)
        source_hash = source_repo.images[tag_or_hash].image_hash
        output.images.by_hash(source_hash).checkout()
        # Record where the base image came from.
        provenance = {
            "type": "FROM",
            "source_namespace": source_repo.namespace,
            "source": source_repo.repository,
            "source_hash": source_hash,
        }
    else:
        # FROM EMPTY AS repository -- initializes an empty repository (say, to create a table or import
        # the results of a previous stage in a multistage build).
        # In this case, if AS repository has been specified, it's already been initialized. If not, this command
        # literally does nothing, so it is rejected.
        if not output_node:
            raise SplitfileError(
                "FROM EMPTY without AS (repository) does nothing!")
    return output, provenance
コード例 #12
0
ファイル: test_common.py プロジェクト: splitgraph/splitgraph
def test_syncable_data_source(local_engine_empty):
    """Sync a test data source into a repository from several base images.

    Covers an initial sync, a plain load into the checked-out empty image, a
    sync based on the empty image and a sync based on the first ingested image.
    """
    source = IngestionTestSource(
        engine=local_engine_empty, credentials={}, params={})
    repo = Repository.from_schema(TEST_REPO)
    repo.init()

    def _check_image(image_hash, query, expected_rows, expected_state):
        # Check out the image and verify its tables, its rows and the stored ingestion state.
        image = repo.images[image_hash]
        assert sorted(image.get_tables()) == ["_sg_ingestion_state", "test_table"]
        image.checkout()
        assert repo.run_sql(query) == expected_rows
        assert _get_state(repo) == expected_state

    # Initial sync
    first_hash = source.sync(repo, "latest")
    assert len(repo.images()) == 2
    _check_image(first_hash, "SELECT * FROM test_table", [(1, "one")],
                 {"last_value": 1})

    # Load the data anew into a different image
    repo.images["0" * 64].checkout()
    source._load(repo.to_schema())
    repo.commit_engines()
    assert repo.run_sql("SELECT * FROM test_table") == [(1, "one")]
    repo.uncheckout(force=True)

    # Perform a sync based on the empty image
    second_hash = source.sync(repo, "0" * 64)
    assert second_hash != first_hash
    _check_image(second_hash, "SELECT * FROM test_table", [(1, "one")],
                 {"last_value": 1})

    # Perform a sync based on the ingested image
    third_hash = source.sync(repo, first_hash)
    assert third_hash != first_hash
    _check_image(
        third_hash,
        "SELECT * FROM test_table ORDER BY key ASC",
        [(1, "one"), (2, "two")],
        {"last_value": 2},
    )
コード例 #13
0
def parse_image_spec(remote_repo_node: Node) -> Tuple["Repository", str]:
    """
    Turns an image specification node into a repository and a tag or hash
    (e.g. noaa/climate:abcdef123 -> Repository('noaa', 'climate'), 'abcdef123').

    :param remote_repo_node: Parse node with the specification
    :return: Tuple of (repository object, tag or hash); the tag defaults to 'latest'
    """
    spec_nodes = extract_nodes(remote_repo_node, ["repository", "tag_or_hash"])
    # Imported here to avoid cyclic imports
    from splitgraph.core.repository import Repository

    repository = Repository.from_schema(spec_nodes[0].match.group(0))
    # A second node is the tag / hash (the executor will try to interpret it as both).
    tag_or_hash = spec_nodes[1].match.group(0) if len(spec_nodes) == 2 else "latest"
    return repository, tag_or_hash
コード例 #14
0
def test_examples(local_engine_empty):
    """Exercise the example-generating commands used in the quickstart."""
    cli = CliRunner()

    def _invoke_ok(command, cli_args):
        # Run a CLI command and require a zero exit code.
        outcome = cli.invoke(command, cli_args)
        assert outcome.exit_code == 0
        return outcome

    _invoke_ok(generate_c, ["example/repo_1"])

    repo = Repository.from_schema("example/repo_1")
    assert len(repo.images()) == 2
    row_count = repo.run_sql(
        "SELECT COUNT(*) FROM demo", return_shape=ResultShape.ONE_ONE)
    assert row_count == 10
    assert repo.diff("demo", repo.head, None, aggregate=True) == (0, 0, 0)

    # Altering leaves the image count unchanged but shows up in the diff against the head.
    _invoke_ok(alter_c, ["example/repo_1"])
    assert len(repo.images()) == 2
    assert repo.diff("demo", repo.head, None, aggregate=True) == (2, 2, 2)

    splitfile_text = _invoke_ok(
        splitfile_c, ["example/repo_1", "example/repo_2"]).stdout
    for expected_line in (
        "FROM example/repo_1 IMPORT demo AS table_1",
        "FROM example/repo_2:${IMAGE_2} IMPORT demo AS table_2",
    ):
        assert expected_line in splitfile_text
コード例 #15
0
def test_singer_ingestion_delete_old_image(local_engine_empty):
    """With ``--delete-old`` on the second ingestion, only one image is left in the repository."""
    cli = CliRunner(mix_stderr=False)

    # The initial load, then an update that is asked to delete the old image.
    invocations = (
        ("singer/initial.json", [TEST_REPO + ":latest"]),
        ("singer/update.json", [TEST_REPO + ":latest", "--delete-old"]),
    )
    for resource, cli_args in invocations:
        with open(os.path.join(INGESTION_RESOURCES, resource), "r") as stream:
            outcome = cli.invoke(
                singer_target, cli_args, input=stream, catch_exceptions=False)
        assert outcome.exit_code == 0

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 1
コード例 #16
0
def test_singer_tap_mysql_sync(local_engine_empty):
    """Sync the MySQL tap source twice and check the image, rows, state and objects."""
    source = _source(local_engine_empty)
    repo = Repository.from_schema(TEST_REPO)

    initial_object = "o69e4529709af65f37f2e2f3a8290340ae7ad9ada6bca9c393a09572f12cbb3"
    expected_rows = [
        (datetime(2012, 11, 11, 8, 6, 26), True, 1, "portobello"),
        (datetime(2018, 3, 17, 8, 6, 26), False, 2, "deathcap"),
    ]
    # The bookmark's "version" is not pinned, so any value is accepted for it.
    expected_state = {
        "bookmarks": {
            "mysqlschema-mushrooms": {
                "replication_key": "mushroom_id",
                "replication_key_value": 2,
                "version": mock.ANY,
            }
        },
        "currently_syncing": None,
    }

    source.sync(repo, "latest")

    assert len(repo.images()) == 1
    latest = repo.images["latest"]
    assert sorted(latest.get_tables()) == ["_sg_ingestion_state", "mushrooms"]
    latest.checkout()

    assert repo.run_sql(
        "SELECT * FROM mushrooms ORDER BY mushroom_id ASC") == expected_rows
    assert repo.run_sql(
        "SELECT state FROM _sg_ingestion_state")[0][0] == expected_state
    assert latest.get_table("mushrooms").objects == [initial_object]

    # Run replication one more time -- the repository must still hold a single image
    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    latest = repo.images["latest"]
    assert latest.get_table("mushrooms").objects == [
        initial_object,
        # TODO: this object has the pk=2 row from the previous one repeated, a tap-mysql bug
        #  but we don't conflate these with Singer now.
        "od487f26d32a347ae4cc81a7442ef5a28615f70a9fff426991ab0d9d14bf7aa",
    ]
コード例 #17
0
def test_singer_tap_mysql_sync(local_engine_empty):
    """Sync the MySQL tap source twice; the second sync must not add another object."""
    source = _source(local_engine_empty)
    repo = Repository.from_schema(TEST_REPO)

    expected_objects = [
        "o69e4529709af65f37f2e2f3a8290340ae7ad9ada6bca9c393a09572f12cbb3"
    ]
    expected_rows = [
        (datetime(2012, 11, 11, 8, 6, 26), True, 1, "portobello"),
        (datetime(2018, 3, 17, 8, 6, 26), False, 2, "deathcap"),
    ]
    # The bookmark's "version" is not pinned, so any value is accepted for it.
    expected_state = {
        "bookmarks": {
            "mysqlschema-mushrooms": {
                "replication_key": "mushroom_id",
                "replication_key_value": 2,
                "version": mock.ANY,
            }
        },
        "currently_syncing": None,
    }

    source.sync(repo, "latest")

    assert len(repo.images()) == 1
    latest = repo.images["latest"]
    assert sorted(latest.get_tables()) == ["_sg_ingestion_state", "mushrooms"]
    latest.checkout()

    assert repo.run_sql(
        "SELECT * FROM mushrooms ORDER BY mushroom_id ASC") == expected_rows
    assert repo.run_sql(
        "SELECT state FROM _sg_ingestion_state")[0][0] == expected_state
    assert latest.get_table("mushrooms").objects == expected_objects

    # Run replication one more time -- check that we didn't add any more rows
    source.sync(repo, "latest")
    assert len(repo.images()) == 1
    latest = repo.images["latest"]
    assert latest.get_table("mushrooms").objects == expected_objects
コード例 #18
0
ファイル: execution.py プロジェクト: yanyu510/splitgraph
def execute_commands(
    commands: str,
    params: Optional[Dict[str, str]] = None,
    output: Optional[Repository] = None,
    output_base: str = "0" * 32,
) -> None:
    """
    Executes a series of Splitfile commands.

    On success, the list of provenance lines collected from the commands is stored on the
    final image and the engine is committed. On any exception, the engine is rolled back
    and the exception is re-raised.

    :param commands: A string with the raw Splitfile.
    :param params: A dictionary of parameters to be applied to the Splitfile (`${PARAM}` is replaced with the specified
        parameter value).
    :param output: Output repository to execute the Splitfile against. If not given, a repository
        with a random ``output_...`` name is used.
    :param output_base: If not None, a revision that gets checked out for all Splitfile actions to be committed
        on top of it.
    """
    if params is None:
        params = {}
    # Only an already existing output repository can be reset to the base revision.
    if output and repository_exists(output) and output_base is not None:
        output.images.by_hash(output_base).checkout()
    # Use a random target schema if unspecified.
    output = output or Repository.from_schema("output_%0.2x" % getrandbits(16))

    # Don't initialize the output until a command writing to it asks us to
    # (otherwise we might have a FROM ... AS output_name change it).
    repo_created = False

    def _initialize_output(output):
        # Creates the repository on first use and remembers that this run created it.
        if not repository_exists(output):
            nonlocal repo_created
            output.init()
            repo_created = True

    from splitgraph.commandline.common import Color

    node_list = parse_commands(commands, params=params)

    # Record the internal structure of commands used to create the final image.
    provenance: List[ProvenanceLine] = []

    try:
        for i, node in enumerate(node_list):
            logging.info(
                Color.BOLD + "\nStep %d/%d : %s" %
                (i + 1, len(node_list), truncate_line(node.text, length=60)) +
                Color.END)
            if node.expr_name == "from":
                # FROM may replace the output repository and doesn't always yield a provenance line.
                output, maybe_provenance_line = _execute_from(node, output)
                if maybe_provenance_line:
                    provenance.append(maybe_provenance_line)

            elif node.expr_name == "import":
                _initialize_output(output)
                provenance_line = _execute_import(node, output)
                provenance.append(provenance_line)

            elif node.expr_name == "sql" or node.expr_name == "sql_file":
                _initialize_output(output)
                provenance_line = _execute_sql(node, output)
                provenance.append(provenance_line)

            elif node.expr_name == "custom":
                _initialize_output(output)
                provenance_line = _execute_custom(node, output)
                provenance.append(provenance_line)

        # Attach the collected provenance to the image the output ended up on.
        final_image = output.head_strict
        final_image.set_provenance(provenance)
        get_engine().commit()
        logging.info("Successfully built %s:%s." %
                     (str(output), final_image.image_hash[:12]))

    except Exception:
        if repo_created and len(output.images()) == 1:
            # As a corner case, if we created a repository and there's been
            # a failure running the Splitfile (on the first command), we delete the dummy
            # 0000... image and the rest of the repository as part of cleanup.
            output.delete()
        get_engine().rollback()
        raise
コード例 #19
0
ファイル: test_mounting.py プロジェクト: dazzag24/splitgraph
from datetime import datetime as dt

import pytest
from test.splitgraph.conftest import _mount_postgres, _mount_mysql, _mount_mongo

from splitgraph.core.repository import Repository
from splitgraph.core.types import TableColumn
from splitgraph.engine import get_engine
from splitgraph.hooks.mount_handlers import mount

# Repositories used as mountpoints for the PostgreSQL, Mongo and MySQL mounting tests.
PG_MNT, MG_MNT, MYSQL_MNT = (
    Repository.from_schema(schema)
    for schema in ("test/pg_mount", "test_mg_mount", "test/mysql_mount")
)


@pytest.mark.mounting
def test_mount_unmount(local_engine_empty):
    """Mount the PostgreSQL test database, read from it, then delete the mountpoint."""
    _mount_postgres(PG_MNT)
    fruit_rows = get_engine().run_sql("""SELECT * FROM "test/pg_mount".fruits""")
    assert (1, "apple") in fruit_rows

    # Deleting the mountpoint must also remove its schema from the engine.
    PG_MNT.delete()
    schema_still_there = get_engine().schema_exists(PG_MNT.to_schema())
    assert not schema_still_there


@pytest.mark.mounting
def test_mount_partial(local_engine_empty):
    """Mounting with ``tables=["fruits"]`` must expose only that table."""
    _mount_postgres(PG_MNT, tables=["fruits"])
    for table_name, should_exist in (("fruits", True), ("vegetables", False)):
        table_found = get_engine().table_exists(PG_MNT.to_schema(), table_name)
        assert bool(table_found) == should_exist


@pytest.mark.mounting
コード例 #20
0
def test_singer_data_source_sync(local_engine_empty):
    """Sync a generic Singer data source twice and check counts and state after each sync."""
    source = GenericSingerDataSource(
        local_engine_empty,
        credentials={"some": "credential"},
        params={"tap_path": TEST_TAP, "other": "param"},
    )
    repo = Repository.from_schema(TEST_REPO)

    def _check_latest(expected_releases, expected_stargazers, expected_state):
        # The repository keeps a single image; verify its tables, row counts and saved state.
        assert len(repo.images()) == 1
        latest = repo.images["latest"]
        assert sorted(latest.get_tables()) == [
            "_sg_ingestion_state", "releases", "stargazers"
        ]
        latest.checkout()

        assert repo.run_sql(
            "SELECT COUNT(1) FROM releases",
            return_shape=ResultShape.ONE_ONE) == expected_releases
        assert repo.run_sql(
            "SELECT COUNT(1) FROM stargazers",
            return_shape=ResultShape.ONE_ONE) == expected_stargazers
        assert repo.run_sql(
            "SELECT state FROM _sg_ingestion_state")[0][0] == expected_state

    # First sync
    source.sync(repo, "latest")
    _check_latest(
        6,
        5,
        {
            "bookmarks": {
                "splitgraph/splitgraph": {
                    "stargazers": {"since": "2020-10-14T11:06:40.852311Z"},
                    "releases": {"since": "2020-10-14T11:06:40.852311Z"},
                }
            }
        },
    )

    # Second sync
    source.sync(repo, "latest")
    _check_latest(
        9,
        6,
        {
            "bookmarks": {
                "splitgraph/splitgraph": {
                    "releases": {"since": "2020-10-14T11:06:42.786589Z"},
                    "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
                }
            }
        },
    )
コード例 #21
0
def test_singer_ingestion_update(local_engine_empty):
    """Run the initial ingestion and then a repeat ingestion with some changes.

    In the repeat ingestion a few rows get updated (a record with the same PK but
    a different value must be picked up as a diff), a few are inserted and one is
    inserted that hasn't changed (it must not be saved in the diff).
    """
    cli = CliRunner(mix_stderr=False)

    # Both inputs go through the same target; ``outcome`` ends up holding the update's result.
    for resource in ("singer/initial.json", "singer/update.json"):
        with open(os.path.join(INGESTION_RESOURCES, resource), "r") as stream:
            outcome = cli.invoke(
                singer_target,
                [TEST_REPO + ":latest"],
                input=stream,
                catch_exceptions=False,
            )
        assert outcome.exit_code == 0

    assert json.loads(outcome.stdout) == {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "releases": {"since": "2020-10-14T11:06:42.786589Z"},
                "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
            }
        }
    }

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2
    latest = repo.images["latest"]
    assert sorted(latest.get_tables()) == ["releases", "stargazers"]
    latest.checkout()

    for query, expected_count in (
        ("SELECT COUNT(1) FROM releases", 9),
        ("SELECT COUNT(1) FROM stargazers", 6),
    ):
        assert repo.run_sql(
            query, return_shape=ResultShape.ONE_ONE) == expected_count

    expected_stargazers = [
        (Decimal("100001"), datetime(2018, 10, 17, 22, 14, 12)),
        (Decimal("100002"), datetime(2018, 11, 6, 11, 26, 16)),
        (Decimal("100003"), datetime(2018, 12, 11, 16, 0, 42)),
        (Decimal("100004"), datetime(2020, 10, 11, 21, 9, 30)),
        (Decimal("100005"), datetime(2019, 4, 18, 2, 40, 47)),
        (Decimal("100006"), datetime(2019, 6, 6, 20, 53)),
    ]
    assert repo.run_sql(
        "SELECT user_id, starred_at FROM stargazers ORDER BY user_id"
    ) == expected_stargazers

    assert latest.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert latest.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a",
        "ocf91fc59f89f9db3db9aea28c4719f8bd009b13990f3f12f93f282618d81a8",
    ]

    assert latest.get_table("stargazers").table_schema == _STARGAZERS_SCHEMA
    # Extra DIFF at the end
    assert latest.get_table("stargazers").objects == [
        "od68e932ebc99c1a337363c1b92056dcf7fc7c6c45494bc42e1e1ec4e0c88ac",
        "oc61804b31dcae8294a6b780efe41601eaeb7a1d0b7cd7bdfea4843db214df0",
    ]

    # The second stargazers object must hold exactly the updated and the newly inserted row.
    diff_query = (
        "SELECT sg_ud_flag, user_id, starred_at "
        "FROM splitgraph_meta.oc61804b31dcae8294a6b780efe41601eaeb7a1d0b7cd7bdfea4843db214df0 "
        "ORDER BY user_id")
    assert repo.run_sql(diff_query) == [
        (True, Decimal("100004"), datetime(2020, 10, 11, 21, 9, 30)),
        (True, Decimal("100006"), datetime(2019, 6, 6, 20, 53)),
    ]
コード例 #22
0
def test_singer_ingestion_schema_change(local_engine_empty):
    """Run the initial ingestion and then one where ``user_id`` in stargazers is a string."""
    cli = CliRunner(mix_stderr=False)

    # Both inputs go through the same target; ``outcome`` ends up holding the second result.
    for resource in ("singer/initial.json", "singer/schema_change.json"):
        with open(os.path.join(INGESTION_RESOURCES, resource), "r") as stream:
            outcome = cli.invoke(
                singer_target,
                [TEST_REPO + ":latest"],
                input=stream,
                catch_exceptions=False,
            )
        assert outcome.exit_code == 0

    assert json.loads(outcome.stdout) == {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
            }
        }
    }

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2
    latest = repo.images["latest"]
    assert sorted(latest.get_tables()) == ["releases", "stargazers"]
    latest.checkout()

    for query, expected_count in (
        ("SELECT COUNT(1) FROM releases", 6),
        ("SELECT COUNT(1) FROM stargazers", 6),
    ):
        assert repo.run_sql(
            query, return_shape=ResultShape.ONE_ONE) == expected_count

    # user_id values now come back as strings.
    expected_stargazers = [
        ("100001", datetime(2018, 10, 17, 22, 14, 12)),
        ("100002", datetime(2018, 11, 6, 11, 26, 16)),
        ("100003", datetime(2018, 12, 11, 16, 0, 42)),
        ("100004", datetime(2020, 10, 11, 21, 9, 30)),
        ("100005", datetime(2019, 4, 18, 2, 40, 47)),
        ("string_user_id", datetime(2019, 4, 18, 2, 40, 47)),
    ]
    assert repo.run_sql(
        "SELECT user_id, starred_at FROM stargazers ORDER BY user_id"
    ) == expected_stargazers

    # Releases unchanged -- same table
    assert latest.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert latest.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a"
    ]

    # Columns listed in ordinal order as (name, PostgreSQL type, is primary key).
    expected_stargazers_schema = [
        TableColumn(
            ordinal=ordinal,
            name=column_name,
            pg_type=pg_type,
            is_pk=is_pk,
            comment=None,
        )
        for ordinal, (column_name, pg_type, is_pk) in enumerate([
            ("_sdc_repository", "character varying", False),
            ("starred_at", "timestamp without time zone", False),
            ("user", "jsonb", False),
            ("user_id", "character varying", True),
        ])
    ]
    assert latest.get_table("stargazers").table_schema == expected_stargazers_schema

    # Stargazers: had a migration, new object
    assert latest.get_table("stargazers").objects == [
        "o9e54958076c86d854ad21da17239daecaec839e84daee8ff9ca5dcecd84cdd"
    ]