# Imports assumed by these tests. fetch_parq is the s3parq fetch module; the
# setup/assert helpers (get_s3_client, setup_partitioned_parquet,
# setup_grouped_dataframe, sorted_dfs_equal_by_pandas_testing) come from this
# repo's own test utilities, so the exact import path may differ.
import boto3
import pandas as pd
import pytest

from s3parq import fetch_parq


def test_fetches_diff_none():
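    # fetch_diff against an empty comparison dataset: every input row is
    # "new", so the diff should equal the input frame.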
    with get_s3_client() as s3_client:
        input_key = "clay/beads"
        input_bucket = "kiln"
        comparison_key = "new-case"
        comparison_bucket = "storefront"
        partitions = ["price"]

        part_types = {"count": "int", "price": "float"}

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28]
        })

        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)

        setup_partitioned_parquet(dataframe=input_df,
                                  bucket=input_bucket,
                                  key=input_key,
                                  partition_data_types={"price": "float"},
                                  s3_client=s3_client)

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        fetched_diff.sort_values(by=['price'], inplace=True)
        input_df.sort_values(by=['price'], inplace=True)

        sorted_dfs_equal_by_pandas_testing(fetched_diff, input_df)

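        # reverse=True diffs the other way (rows in the comparison set that
        # are missing from the input); the comparison set is empty, so the
        # result should be empty too.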
        fetched_diff_reverse = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)

        assert fetched_diff_reverse.empty

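        # Swapping input and comparison while reversed flips the diff back,
        # so the original input frame should be recovered.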
        fetched_diff_reverse_both = fetch_parq.fetch_diff(
            input_bucket=comparison_bucket,
            input_key=comparison_key,
            comparison_bucket=input_bucket,
            comparison_key=input_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)

        sorted_dfs_equal_by_pandas_testing(fetched_diff_reverse_both, input_df)


def test_fetches_diff():
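    # fetch_diff across two populated datasets: only rows whose "exp-date"
    # partition value is missing from the comparison set should come back.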
    with get_s3_client() as s3_client:
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        comparison_key = "burger-inventory/buns"
        comparison_bucket = "backroom"
        partitions = ["exp-date"]

        part_types = {"count": "int", "price": "float", "exp-date": "string"}

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9, 9],
            "price": [2.43, 1.23, 5.76, 3.28, 4.44],
            "exp-date": ["x", "z", "a", "zz", "l"]
        })
        comparison_df = pd.DataFrame({
            "count": [2, 3, 4, 9],
            "price": [2.43, 4.35, 1.23, 3.28],
            "exp-date": ["x", "y", "z", "zz"]
        })

        setup_partitioned_parquet(dataframe=input_df,
                                  bucket=input_bucket,
                                  key=input_key,
                                  partition_data_types={"exp-date": "string"},
                                  s3_client=s3_client)

        setup_partitioned_parquet(dataframe=comparison_df,
                                  bucket=comparison_bucket,
                                  key=comparison_key,
                                  partition_data_types={"exp-date": "string"},
                                  s3_client=s3_client)

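        # Expected diff: exp-dates "a" and "l" exist only in the input set.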
        test_df = pd.DataFrame({
            "count": [7, 9],
            "price": [5.76, 4.44],
            "exp-date": ["a", "l"]
        })

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        assert fetched_diff.shape == test_df.shape
        sorted_dfs_equal_by_pandas_testing(fetched_diff, test_df)


def test_s3_parquet_to_dataframe():
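    # Round-trip a single unpartitioned parquet file through the private
    # reader and check that columns, shape, and values survive.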
    with get_s3_client() as s3_client:
        columns = {
            "string_col": "string",
            "int_col": "integer",
            "float_col": "float",
            "bool_col": "boolean",
            "datetime_col": "datetime"
        }

        bucket = "foobucket"
        key = "fookey"

        df = setup_grouped_dataframe(count=10, columns=columns)
        bucket, parquet_paths = setup_partitioned_parquet(
            dataframe=df,
            bucket=bucket,
            key=key,
            partition_data_types={},
            s3_client=s3_client)

        first_published_file = parquet_paths[0]
        response = fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket, key=first_published_file, partition_metadata={})

        assert isinstance(response, pd.DataFrame)
        for col in columns.keys():
            assert (col in response.columns)

        assert response.shape == df.shape
        sorted_dfs_equal_by_pandas_testing(response, df)


def test_s3_partitioned_parquet_to_dataframe():
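    # All columns except "metrics" are partitions here, so the reader must
    # rebuild their values from the key path via partition_metadata.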
    partition_types = {"string_col": "string",
                       "int_col": "integer",
                       "float_col": "float",
                       "bool_col": "boolean",
                       "datetime_col": "datetime"}
    columns = dict(partition_types)
    columns["metrics"] = "int"

    bucket = "foobucket"
    key = "fookey"

    df = setup_grouped_dataframe(count=10, columns=columns)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types=partition_types
    )

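    # A single partition file only stores the non-partition data; the
    # partition values are restored from its key path.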
    first_published_file = parquet_paths[0]
    response = fetch_parq._s3_parquet_to_dataframe(
        bucket=bucket, key=first_published_file, partition_metadata=partition_types)

    assert isinstance(response, pd.DataFrame)
    for col in columns.keys():
        assert (col in response.columns)

    # DataFrame.append was removed in pandas 2.x; build the full dataset by
    # concatenating the frame read back from each partition file
    full_response = pd.concat(
        [fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket, key=path, partition_metadata=partition_types)
         for path in parquet_paths],
        ignore_index=True)

    assert full_response.shape == df.shape
    sorted_dfs_equal_by_pandas_testing(full_response, df)


def test_fetch_when_none():
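    # fetch with a filter that matches no partition values should return an
    # empty frame that still carries the dataset's columns and dtypes.
    # NOTE: this test builds a bare boto3 client, so it assumes an S3 mock
    # (e.g. moto) is active via a fixture or decorator not shown here.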
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"
    partitions = ["exp-date"]

    part_types = {"count": "int", "price": "float", "exp-date": "str"}

    fetched_dtypes = pd.Series(["int64", "float64", "object"],
                               index=["count", "price", "exp-date"])

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28],
        "exp-date": ["x", "z", "a", "zz"]
    })

    s3_client = boto3.client('s3')

    setup_partitioned_parquet(dataframe=input_df,
                              bucket=input_bucket,
                              key=input_key,
                              partition_data_types={"exp-date": "string"},
                              s3_client=s3_client)

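    # Filter on a partition value that was never written.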
    filters = [{
        "partition": "exp-date",
        "comparison": "==",
        "values": ["not-there"]
    }]

    fetched = fetch_parq.fetch(bucket=input_bucket,
                               key=input_key,
                               filters=filters,
                               parallel=False)

    # Testing that DF is empty and has the expected columns+dtypes
    assert fetched.empty
    assert fetched.dtypes.equals(fetched_dtypes)


def test_get_data_types_from_s3():
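    # setup_partitioned_parquet() with no arguments publishes the helper's
    # default dataset; the partition metadata parsed from the first key
    # should match the helper's default partition types. Like the test
    # above, this assumes an S3 mock is active.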
    bucket, parquet_paths = setup_partitioned_parquet()

    s3_client = boto3.client('s3')
    files = s3_client.list_objects_v2(Bucket=bucket)
    first_file_key = files["Contents"][0]["Key"]
    partition_metadata = fetch_parq._get_partitions_and_types(
        first_file_key, bucket)

    assert partition_metadata == {
        "string_col": "string",
        "int_col": "integer",
        "float_col": "float",
        "bool_col": "boolean",
        "datetime_col": "datetime"
    }


def test_gets_max_denies_text():
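    # Partitions without a meaningful ordering are refused:
    # get_max_partition_value on a string or bool partition should raise.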
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"string_col": "string", "bool_col": "bool"}
    col_types = dict(part_types)
    col_types["metrics"] = "int"
    df = setup_grouped_dataframe(count=10, columns=col_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df, bucket=bucket, key=key, partition_data_types=part_types)

    with pytest.raises(ValueError):
        fetch_parq.get_max_partition_value(
            bucket=bucket, key=key, partition="string_col")

    with pytest.raises(ValueError):
        fetch_parq.get_max_partition_value(bucket=bucket,
                                           key=key,
                                           partition="bool_col")


def test_gets_max():
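    # Publish a dataset partitioned on "int_col" and fetch that partition's
    # max value back from S3.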
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"int_col": "int", "float_col": "float"}

    df = setup_grouped_dataframe(count=10, columns=part_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types={"int_col": "int"})

    fetched_max = fetch_parq.get_max_partition_value(bucket=bucket,
                                                     key=key,
                                                     partition="int_col")

    # The max value fetched for the partition should equal the column's max
    # in the source frame
    assert df["int_col"].max() == fetched_max