Ejemplo n.º 1
0
def test_pandas_update_patch(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    old = ingestion_test_repo.commit()

    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    new = ingestion_test_repo.commit()

    old.checkout()
    new.checkout()

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, dt(2018, 1, 1, 0, 11, 11), "apple"),
        (2, dt(2018, 1, 2, 0, 22, 22), "orange"),
        (3, dt(2018, 12, 31, 23, 59, 49), "mayonnaise"),
        (4, dt(2018, 12, 30, 0, 0), "chandelier"),
    ]

    assert sorted(ingestion_test_repo.diff("test_table", old, new)) == [
        (False, (3, dt(2018, 1, 3, 0, 33, 33), "mayonnaise")),
        (False, (4, dt(2018, 1, 4, 0, 44, 44), "mustard")),
        (True, (3, dt(2018, 12, 31, 23, 59, 49), "mayonnaise")),
        (True, (4, dt(2018, 12, 30, 0, 0), "chandelier")),
    ]
Ejemplo n.º 2
0
def test_pandas_read_basic(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    old = ingestion_test_repo.commit()

    # We currently don't detect the index column name since it's an arbitrary query that's passed.
    output = sql_to_df("SELECT * FROM test_table",
                       repository=ingestion_test_repo,
                       index_col="fruit_id")

    assert_frame_equal(base_df, output)
Ejemplo n.º 3
0
def test_pandas_update_type_changes_weaker(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()
    altered_df = upd_df_1.copy()
    altered_df["name"] = [4, 5, 6]
    altered_df["name"] = altered_df["name"].astype(int)

    # Type changes are passed through to Postgres to see if it can coerce them -- in this case
    # 'name' remains a string, so a string '4' is written for fruit_id = 2.
    df_to_table(altered_df, ingestion_test_repo, "test_table", if_exists="patch")
    assert ingestion_test_repo.run_sql("SELECT name FROM test_table WHERE fruit_id = 2") == [("4",)]
Ejemplo n.º 4
0
def build_repo():
    repo = Repository(namespace="abc", repository="1234")
    repo.delete()
    repo.init()
    df_to_table(fake_data(8),
                repository=repo,
                table="unit_test",
                if_exists='replace')
    new_img = repo.commit()
    new_img.checkout()

    return repo
Ejemplo n.º 5
0
def test_pandas_update_type_changes_stricter(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()
    altered_df = upd_df_1.copy()
    altered_df.index = altered_df.index.map(str)

    # fruit_id (integer) is a string in this case -- Postgres should try to coerce
    # it back into integer to patch into the target table.

    df_to_table(altered_df, ingestion_test_repo, "test_table", if_exists="patch")
    assert ingestion_test_repo.run_sql("SELECT name FROM test_table WHERE fruit_id = 4") == [
        ("chandelier",)
    ]
Ejemplo n.º 6
0
def test_evil_pandas_dataframes(ingestion_test_repo):
    # Test corner cases we found in the real world
    df = _str_to_df(load_csv("evil_df.csv"), has_ts=False)
    df_to_table(df, ingestion_test_repo, "test_table", if_exists="patch")

    assert ingestion_test_repo.run_sql(
        "SELECT id, job_title, some_number FROM test_table ORDER BY id"
    ) == [
        # Make sure backslashes don't break ingestion -- not exactly sure what the intention
        # in the original dataset was (job title is "PRESIDENT\").
        (1, "PRESIDENT\\", 25),
        # Test characters that can be used as separators still make it into fields
        (2, "\t", 26),
    ]
Ejemplo n.º 7
0
def test_pandas_kv(ingestion_test_repo):
    # Test reads and writes with a key-value type dataframe.
    df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df_kv.csv"),
                     index_col=0)
    df_to_table(df, ingestion_test_repo, "test_table")

    # Test patching works without specifying an index col
    df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "patch_df_kv.csv"))
    df_to_table(df, ingestion_test_repo, "test_table", if_exists="patch")

    assert_frame_equal(
        sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo),
        pd.DataFrame(data=[(1, "banana"), (2, "kumquat"), (3, "pendulum")],
                     columns=["key", "value"]),
    )
Ejemplo n.º 8
0
def test_pandas_basic_insert(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()

    assert ingestion_test_repo.head.get_table("test_table").table_schema == [
        TableColumn(1, "fruit_id", "bigint", True),
        TableColumn(2, "timestamp", "timestamp without time zone", False),
        TableColumn(3, "name", "text", False),
    ]

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, dt(2018, 1, 1, 0, 11, 11), "apple"),
        (2, dt(2018, 1, 2, 0, 22, 22), "orange"),
        (3, dt(2018, 1, 3, 0, 33, 33), "mayonnaise"),
        (4, dt(2018, 1, 4, 0, 44, 44), "mustard"),
    ]
Ejemplo n.º 9
0
def test_pandas_read_other_checkout(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    old = ingestion_test_repo.commit()
    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    new = ingestion_test_repo.commit()

    # Record the second version of the table
    patched_df = sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo)

    # Check out the old version but run the query against the new image -- new version should come out.
    old.checkout()
    output_1 = sql_to_df("SELECT * FROM test_table", image=new)
    assert_frame_equal(output_1, patched_df)

    # Test works with hashes
    output_2 = sql_to_df(
        "SELECT * FROM test_table", image=new.image_hash[:10], repository=ingestion_test_repo
    )
    assert_frame_equal(output_2, patched_df)
Ejemplo n.º 10
0
def test_pandas_no_processing_insert(ingestion_test_repo):
    # Make sure everything still works when we don't have a PK.
    df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df.csv"))
    df_to_table(df, ingestion_test_repo, "test_table")
    ingestion_test_repo.commit()

    assert ingestion_test_repo.head.get_table("test_table").table_schema == [
        TableColumn(1, "fruit_id", "bigint", False),
        TableColumn(2, "timestamp", "text", False),
        TableColumn(3, "name", "text", False),
    ]

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, "2018-01-01 00:11:11", "apple"),
        (2, "2018-01-02 00:22:22", "orange"),
        (3, "2018-01-03 00:33:33", "mayonnaise"),
        (4, "2018-01-04 00:44:44", "mustard"),
    ]
Ejemplo n.º 11
0
def test_pandas_update_replace(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    old = ingestion_test_repo.commit()

    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="replace")
    new = ingestion_test_repo.commit()

    old.checkout()
    new.checkout()

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (2, dt(2018, 1, 2, 0, 22, 22), "orange"),
        (3, dt(2018, 12, 31, 23, 59, 49), "mayonnaise"),
        (4, dt(2018, 12, 30, 0, 0), "chandelier"),
    ]

    # Since the table was replaced, we store it as a new snapshot instead of a patch.
    assert len(ingestion_test_repo.images["latest"].get_table("test_table").objects) == 1
Ejemplo n.º 12
0
def test_pandas_update_different_schema(ingestion_test_repo):
    # Currently patches with dataframes with different columns are unsupported
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()

    # Delete the 'timestamp' column
    truncated_df = upd_df_1["timestamp"]

    with pytest.raises(ValueError) as e:
        df_to_table(truncated_df,
                    ingestion_test_repo,
                    "test_table",
                    if_exists="patch")
        assert "Schema changes are unsupported" in str(e.value)

    # Rename a column
    renamed_df = upd_df_1.copy()
    renamed_df.columns = ["timestamp", "name_rename"]

    with pytest.raises(ValueError) as e:
        df_to_table(renamed_df,
                    ingestion_test_repo,
                    "test_table",
                    if_exists="patch")
        assert "Schema changes are unsupported" in str(e.value)
Ejemplo n.º 13
0
    def run(self,
            request: DataFrameToTableRequest,
            repo_uri: str = None,
            **kwargs: Any):
        """

        Args:

        Returns:

        """
        assert repo_uri, 'Must specify repo_uri.'
        repo_info = parse_repo(repo_uri)

        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)

        df_to_table(request.data_frame,
                    repository=repo,
                    table=request.table,
                    if_exists=request.if_exists,
                    schema_check=self.schema_check)
Ejemplo n.º 14
0
    def run(self,
            data_frame: pd.DataFrame,
            table: str = None,
            if_exists: str = None,
            repo_uri: str = None,
            **kwargs: Any):
        """

        Args:

        Returns:

        """
        assert repo_uri, 'Must specify repo_uri.'
        repo_info = parse_repo(repo_uri)

        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)

        df_to_table(data_frame,
                    repository=repo,
                    table=table,
                    if_exists=if_exists)
Ejemplo n.º 15
0
    def test_can_push(self):
        checkout = SemanticCheckoutTask(upstream_repos=dict(
            abc=f'sgr://{remote_name}/abc/1234?tag=1', ), )

        workspaces = checkout.run()

        push = PushRepoTask(workspaces=workspaces, )

        df_to_table(fake_data(10),
                    repository=self.repo,
                    table="unit_test",
                    if_exists='replace')
        self.repo.commit()

        runner = TaskRunner(task=push)

        with raise_on_exception():
            with prefect.context():
                state = runner.run()

                if state.is_failed():
                    print(state)
                    self.fail()
Ejemplo n.º 16
0
    def test_can_commit(self):

        old_image_hash = self.repo.head.image_hash
        checkout = SemanticCheckoutTask(upstream_repos=dict(
            abc='sgr:///abc/1234?tag=1', ), )
        workspaces = checkout.run()

        commit = CommitTask(workspaces=workspaces, )

        df_to_table(fake_data(10),
                    repository=self.repo,
                    table="unit_test",
                    if_exists='replace')

        runner = TaskRunner(task=commit)
        with raise_on_exception():
            with prefect.context():
                state = runner.run()

                if state.is_failed():
                    print(state)
                    self.fail()
            self.assertNotEqual(self.repo.head.image_hash, old_image_hash)
Ejemplo n.º 17
0
def test_pandas_read_lq_checkout(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    old = ingestion_test_repo.commit()
    # Record a DF with just the values for fruit_id = 3 and 4 (ones that will be updated).
    query = "SELECT * FROM test_table WHERE fruit_id IN (3, 4)"

    old_3_4 = sql_to_df(query, repository=ingestion_test_repo)

    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    new = ingestion_test_repo.commit()
    new_3_4 = sql_to_df(query, repository=ingestion_test_repo)

    # Uncheckout the repo but don't delete it.
    ingestion_test_repo.uncheckout()
    assert ingestion_test_repo.head is None

    output_old = sql_to_df(query, image=old, use_lq=True)
    assert_frame_equal(old_3_4, output_old)

    output_new = sql_to_df(query, image=new, use_lq=True)
    assert_frame_equal(new_3_4, output_new)

    # Make sure we didn't do an actual checkout.
    assert ingestion_test_repo.head is None
Ejemplo n.º 18
0
def test_pandas_read_roundtripping(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    old = ingestion_test_repo.commit()
    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    new = ingestion_test_repo.commit()

    df = sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo, index_col="fruit_id")

    # Pandas update syntax: update index 4 (fruit ID) to have a new timestamp.
    df.at[4, "timestamp"] = dt(2018, 1, 1, 1, 1, 1)

    # Write the whole df back in patch mode -- despite that, the changeset will only contain the updated cell.
    df_to_table(df, ingestion_test_repo, "test_table", if_exists="patch")
    new_2 = ingestion_test_repo.commit()

    assert ingestion_test_repo.diff("test_table", new, new_2) == [
        (False, (4, dt(2018, 12, 30, 0, 0), "chandelier")),
        (True, (4, dt(2018, 1, 1, 1, 1, 1), "chandelier")),
    ]
    def write(self, value_: Any, **kwargs: Any) -> Result:
        """
        Writes the result to a repository on Splitgraph


        Args:
            - value_ (Any): the value to write; will then be stored as the `value` attribute
                of the returned `Result` instance
            - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

        Returns:
            - Result: returns a new `Result` with both `value`, `comment`, `table`, and `tag` attributes
        """

        if self.schema is not None:
            errors = self.schema.validate(value_)
            if errors:
                raise SchemaValidationError(errors)

        new = self.format(**kwargs)
        new.value = value_

        repo_info = parse_repo(new.location)

        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)
        remote = Repository.from_template(repo,
                                          engine=get_engine(
                                              repo_info.remote_name,
                                              autocommit=True))

        assert isinstance(value_, pd.DataFrame)

        if not repository_exists(repo) and self.auto_init_repo:
            self.logger.info("Creating repo {}/{}...".format(
                repo.namespace, repo.repository))
            repo.init()

        # TODO: Retrieve the repo from bedrock first

        self.logger.info("Starting to upload result to {}...".format(
            new.location))

        with self.atomic(repo.engine):
            self.logger.info("checkout")
            img = repo.head

            img.checkout(force=True)

            self.logger.info("df to table")
            df_to_table(new.value,
                        repository=repo,
                        table=repo_info.table,
                        if_exists='replace')

            self.logger.info("commit")
            new_img = repo.commit(comment=new.comment, chunk_size=10000)
            new_img.tag(repo_info.tag)

        # if (repo.diff(new.table, img, new_img)):
        if self.auto_push:
            self.logger.info("push")
            repo.push(
                remote,
                handler="S3",
                overwrite_objects=True,
                overwrite_tags=True,
                reupload_objects=True,
            )

        self.logger.info("Finished uploading result to {}...".format(
            new.location))

        return new
Ejemplo n.º 20
0
    def write(self, value_: Any, **kwargs: Any) -> Result:
        """
        Writes the result to a repository on Splitgraph


        Args:
            - value_ (Any): the value to write; will then be stored as the `value` attribute
                of the returned `Result` instance
            - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

        Returns:
            - Result: returns a new `Result` with both `value`, `comment`, `table`, and `tag` attributes
        """

        cfg = patch_config(create_config_dict(), self.env or dict())
        engine = PostgresEngine(name='SplitgraphResult', conn_params=cfg)
        engine.initialize()
        repo = Repository(namespace=self.namespace, repository=self.repo_name, engine=engine)

        assert isinstance(value_, pd.DataFrame)
        assert engine.connected

        if not repository_exists(repo) and self.auto_init_repo:
            self.logger.info("Creating repo {}/{}...".format(repo.namespace, repo.repository))
            repo.init()

        # TODO: Retrieve the repo from bedrock first

        new = self.format(**kwargs)
        new.value = value_

        self.logger.info("Starting to upload result to {}...".format(new.table))

        with self.atomic(engine):
            self.logger.info("checkout")
            img = repo.head
            img.checkout(force=True)

            self.logger.info("df to table")
            df_to_table(new.value, repository=repo, table=new.table, if_exists='replace')

            self.logger.info("commit")
            new_img = repo.commit(comment=new.comment, chunk_size=10000)
            new_img.tag(new.tag)


        # if (repo.diff(new.table, img, new_img)):
        if self.auto_push:
            self.logger.info("push")
            repo.push(
                self.get_upstream(repo),
                handler="S3",
                overwrite_objects=True,
                overwrite_tags=True,
                reupload_objects=True,
            )

        engine.close()
        self.logger.info("Finished uploading result to {}...".format(new.table))

        return new