Ejemplo n.º 1
0
def test_pandas_read_basic(ingestion_test_repo):
    """A dataframe written via df_to_table should read back unchanged."""
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    _ = ingestion_test_repo.commit()

    # We currently don't detect the index column name since it's an arbitrary query that's passed.
    result = sql_to_df(
        "SELECT * FROM test_table", repository=ingestion_test_repo, index_col="fruit_id"
    )

    assert_frame_equal(base_df, result)
Ejemplo n.º 2
0
def test_pandas_read_other_checkout(ingestion_test_repo):
    """Reading can target an image other than the currently checked-out one."""
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    first = ingestion_test_repo.commit()
    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    second = ingestion_test_repo.commit()

    # Capture the table contents after the patch was applied.
    expected = sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo)

    # Check out the old version but run the query against the new image -- new version should come out.
    first.checkout()
    assert_frame_equal(sql_to_df("SELECT * FROM test_table", image=second), expected)

    # Addressing the image by a hash prefix works as well.
    by_hash = sql_to_df(
        "SELECT * FROM test_table", image=second.image_hash[:10], repository=ingestion_test_repo
    )
    assert_frame_equal(by_hash, expected)
Ejemplo n.º 3
0
def test_pandas_kv(ingestion_test_repo):
    """Reads and writes with a key-value-shaped dataframe."""
    base = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df_kv.csv"), index_col=0)
    df_to_table(base, ingestion_test_repo, "test_table")

    # Patching has to work even when no index column is specified.
    patch = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "patch_df_kv.csv"))
    df_to_table(patch, ingestion_test_repo, "test_table", if_exists="patch")

    expected = pd.DataFrame(
        data=[(1, "banana"), (2, "kumquat"), (3, "pendulum")], columns=["key", "value"]
    )
    assert_frame_equal(
        sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo), expected
    )
Ejemplo n.º 4
0
def test_pandas_read_lq_checkout(ingestion_test_repo):
    """Layered querying serves any image without a physical checkout."""
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    image_old = ingestion_test_repo.commit()

    # Snapshot the rows for fruit_id 3 and 4 (the ones the patch will touch).
    query = "SELECT * FROM test_table WHERE fruit_id IN (3, 4)"
    before = sql_to_df(query, repository=ingestion_test_repo)

    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    image_new = ingestion_test_repo.commit()
    after = sql_to_df(query, repository=ingestion_test_repo)

    # Uncheckout the repo but don't delete it.
    ingestion_test_repo.uncheckout()
    assert ingestion_test_repo.head is None

    assert_frame_equal(before, sql_to_df(query, image=image_old, use_lq=True))
    assert_frame_equal(after, sql_to_df(query, image=image_new, use_lq=True))

    # Make sure we didn't do an actual checkout.
    assert ingestion_test_repo.head is None
Ejemplo n.º 5
0
def test_pandas_read_roundtripping(ingestion_test_repo):
    """Reading a table, editing one cell and patching back produces a one-cell diff."""
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()
    df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch")
    image_before = ingestion_test_repo.commit()

    frame = sql_to_df(
        "SELECT * FROM test_table", repository=ingestion_test_repo, index_col="fruit_id"
    )

    # Pandas update syntax: update index 4 (fruit ID) to have a new timestamp.
    frame.at[4, "timestamp"] = dt(2018, 1, 1, 1, 1, 1)

    # Write the whole df back in patch mode -- despite that, the changeset will only contain the updated cell.
    df_to_table(frame, ingestion_test_repo, "test_table", if_exists="patch")
    image_after = ingestion_test_repo.commit()

    assert ingestion_test_repo.diff("test_table", image_before, image_after) == [
        (False, (4, dt(2018, 12, 30, 0, 0), "chandelier")),
        (True, (4, dt(2018, 1, 1, 1, 1, 1), "chandelier")),
    ]
Ejemplo n.º 6
0
    def run(self, repo_uri: str = None, query: str = None, schema: Schema = None, layer_query: bool = None, **kwargs: Any):
        """Run a SQL query against a Splitgraph repository and return a dataframe.

        Args:
            repo_uri: Repository identifier understood by ``parse_repo``
                (namespace/repository); required.
            query: SQL query to execute against the repository.
            schema: Optional schema object; when given, ``schema.validate`` is
                called on the resulting dataframe and any returned errors are
                raised as a ``SchemaValidationError``.
            layer_query: Forwarded to ``sql_to_df`` as ``use_lq`` (query the
                image via layered querying instead of a checkout).
            **kwargs: Ignored; accepted for interface compatibility.

        Returns:
            The dataframe produced by ``sql_to_df``.

        Raises:
            ValueError: If ``repo_uri`` is not provided.
            SchemaValidationError: If schema validation reports errors.
        """
        # Raise instead of `assert`: asserts are stripped under `python -O`,
        # which would silently skip this validation.
        if not repo_uri:
            raise ValueError("Must specify repo_uri.")
        repo_info = parse_repo(repo_uri)

        repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository)
        data = sql_to_df(query, repository=repo, use_lq=layer_query)

        if schema is not None:
            errors = schema.validate(data)
            if errors:
                raise SchemaValidationError(errors)

        return data
    def read(self, location: str) -> Result:
        """Clone the configured repository and load its table into a new Result.

        Returns a copy of this handler with ``location`` set and ``value``
        holding the dataframe read from the cloned image. Any failure is
        logged and re-raised.
        """
        result = self.copy()
        result.location = location
        try:
            local = Repository(
                namespace=result.repo_info.namespace,
                repository=result.repo_info.repository,
            )
            remote = Repository.from_template(
                local,
                engine=get_engine(result.repo_info.remote_name, autocommit=True),
            )

            # Pull down just the tagged image, clobbering local objects/tags.
            cloned_repo = clone(
                remote,
                local_repository=local,
                download_all=True,
                overwrite_objects=True,
                overwrite_tags=True,
                single_image=result.repo_info.tag,
            )
            frame = sql_to_df(
                f"SELECT * FROM {result.repo_info.table}",
                repository=cloned_repo,
                use_lq=self.layer_query,
            )

            if self.schema is not None:
                errors = self.schema.validate(frame)
                if errors:
                    raise SchemaValidationError(errors)

            result.value = frame
        except Exception as exc:
            self.logger.exception(
                "Unexpected error while reading from result handler: {}".
                format(repr(exc)))
            raise exc

        return result
Ejemplo n.º 8
0
import scipy.stats as ss

from splitgraph.core.repository import Repository
from splitgraph.ingestion.pandas import sql_to_df

# Load the dataset we created into Pandas (layered query -- no checkout needed).
latest_image = Repository("", "qoz_vote_fraction").images["latest"]
frame = sql_to_df("SELECT * FROM qoz_vote_fraction", image=latest_image, use_lq=True)
print(frame)

# Is there a correlation between the Trump vote fraction and the fraction of
# QOZ-qualified tracts in every county?
print(ss.linregress(frame["trump_vote_fraction"], frame["qoz_tract_fraction"]))