def test_fetches_diff_none():
    """Diff a populated dataset against a comparison location with no data.

    Only the input bucket/key receives a dataset; the comparison bucket is
    created but left empty. Three expectations are checked:

    * the forward diff equals the whole input dataframe,
    * the reverse diff is empty,
    * swapping the input/comparison roles while passing ``reverse=True``
      yields the whole input dataframe again.
    """
    with get_s3_client() as s3_client:
        input_key = "clay/beads"
        input_bucket = "kiln"
        comparison_key = "new-case"
        comparison_bucket = "storefront"
        partition = "price"

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28],
        })

        # Both buckets are created, but data is written to the input side only.
        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)
        setup_partitioned_parquet(
            dataframe=input_df,
            bucket=input_bucket,
            key=input_key,
            partition_data_types={"price": "float"},
            s3_client=s3_client,
        )

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partition,
            parallel=False,
        )

        # Sort both frames the same way before comparing; input_df stays
        # sorted for the final comparison further down.
        fetched_diff.sort_values(by=['price'], inplace=True)
        input_df.sort_values(by=['price'], inplace=True)
        sorted_dfs_equal_by_pandas_testing(fetched_diff, input_df)

        fetched_diff_reverse = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partition,
            reverse=True,
            parallel=False,
        )
        assert fetched_diff_reverse.empty

        # Roles swapped: the empty location is now the "input" and the
        # populated dataset is the "comparison".
        fetched_diff_reverse_both = fetch_parq.fetch_diff(
            input_bucket=comparison_bucket,
            input_key=comparison_key,
            comparison_bucket=input_bucket,
            comparison_key=input_key,
            partition=partition,
            reverse=True,
            parallel=False,
        )
        sorted_dfs_equal_by_pandas_testing(fetched_diff_reverse_both, input_df)
def test_fetches_diff_none(self):
    """Diff a published dataset against a comparison location with no data.

    Only the input bucket/key is published to. The test expects the forward
    diff to be empty and the reverse diff to carry the same ``count`` and
    ``price`` columns as the input dataframe.
    """
    input_key = "clay/beads"
    input_bucket = "kiln"
    comparison_key = "new-case"
    comparison_bucket = "storefront"
    partitions = ["price"]

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28],
    })

    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=input_bucket)
    s3_client.create_bucket(Bucket=comparison_bucket)

    # The return value of publish is not needed by this test.
    publish(
        bucket=input_bucket,
        key=input_key,
        dataframe=input_df,
        partitions=partitions,
    )

    fetched_diff = fetch_parq.fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partitions[0],
        parallel=False,
    )
    assert fetched_diff.empty

    fetched_diff_reverse = fetch_parq.fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partitions[0],
        reverse=True,
        parallel=False,
    )

    # Align row order by index so the column-wise comparisons line up.
    fetched_diff_reverse.sort_index(inplace=True)
    input_df.sort_index(inplace=True)

    assert fetched_diff_reverse['count'].equals(input_df['count'])
    assert fetched_diff_reverse['price'].equals(input_df['price'])
def test_fetches_diff(self):
    """Diff two published datasets on the ``exp-date`` partition.

    The input holds ``exp-date`` values x, z, a, zz and the comparison holds
    x, y, z, zz, so "a" is the only value present in the input and missing
    from the comparison. The fetched diff is expected to be that single row.
    """
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"
    comparison_key = "burger-inventory/buns"
    comparison_bucket = "backroom"
    partitions = ["exp-date"]

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28],
        "exp-date": ["x", "z", "a", "zz"],
    })
    comparison_df = pd.DataFrame({
        "count": [2, 3, 4, 9],
        "price": [2.43, 4.35, 1.23, 3.28],
        "exp-date": ["x", "y", "z", "zz"],
    })

    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=input_bucket)
    s3_client.create_bucket(Bucket=comparison_bucket)

    # The return values of publish are not needed by this test.
    publish(
        bucket=input_bucket,
        key=input_key,
        dataframe=input_df,
        partitions=partitions,
    )
    publish(
        bucket=comparison_bucket,
        key=comparison_key,
        dataframe=comparison_df,
        partitions=partitions,
    )

    test_df = pd.DataFrame({
        "count": [7],
        "price": [5.76],
        "exp-date": ["a"],
    })

    fetched_diff = fetch_parq.fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partitions[0],
        parallel=False,
    )

    # Only the first row is compared below, so pin the row count first;
    # otherwise unexpected extra rows in the diff would go unnoticed.
    assert len(fetched_diff) == len(test_df)

    # Both frames are expected to hold a single row, so that row is compared
    # as a Series. The original author avoided whole-frame DataFrame.equals
    # here; the exact reason was not recorded.
    assert fetched_diff.iloc[0].equals(test_df.iloc[0])
def test_fetches_diff():
    """Diff two partitioned datasets on the ``exp-date`` partition.

    The input holds ``exp-date`` values x, z, a, zz, l and the comparison
    holds x, y, z, zz, so "a" and "l" are the values present in the input and
    missing from the comparison. The fetched diff is expected to contain
    exactly those two rows.
    """
    with get_s3_client() as s3_client:
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        comparison_key = "burger-inventory/buns"
        comparison_bucket = "backroom"
        partition = "exp-date"

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9, 9],
            "price": [2.43, 1.23, 5.76, 3.28, 4.44],
            "exp-date": ["x", "z", "a", "zz", "l"],
        })
        comparison_df = pd.DataFrame({
            "count": [2, 3, 4, 9],
            "price": [2.43, 4.35, 1.23, 3.28],
            "exp-date": ["x", "y", "z", "zz"],
        })

        setup_partitioned_parquet(
            dataframe=input_df,
            bucket=input_bucket,
            key=input_key,
            partition_data_types={"exp-date": "string"},
            s3_client=s3_client,
        )
        setup_partitioned_parquet(
            dataframe=comparison_df,
            bucket=comparison_bucket,
            key=comparison_key,
            partition_data_types={"exp-date": "string"},
            s3_client=s3_client,
        )

        test_df = pd.DataFrame({
            "count": [7, 9],
            "price": [5.76, 4.44],
            "exp-date": ["a", "l"],
        })

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partition,
            parallel=False,
        )

        # Check the shape first so a size mismatch fails with a clear message
        # before the content comparison runs.
        assert fetched_diff.shape == test_df.shape
        sorted_dfs_equal_by_pandas_testing(fetched_diff, test_df)
def fetch_diff(self,
               input_bucket: str,
               input_key: str,
               comparison_bucket: str,
               comparison_key: str,
               partition: str,
               parallel: bool = True
               ) -> pd.DataFrame:
    """Delegate to the module-level ``fetch_diff`` and return its result.

    Every argument is forwarded unchanged by keyword. Inside a method body
    the bare name ``fetch_diff`` resolves to the module-level function, not
    to this method, so the call below does not recurse.

    Args:
        input_bucket: Bucket of the input dataset.
        input_key: Key of the input dataset.
        comparison_bucket: Bucket of the dataset to compare against.
        comparison_key: Key of the dataset to compare against.
        partition: Name of the partition the diff is computed on.
        parallel: Forwarded as-is to the underlying function.

    Returns:
        The dataframe produced by the module-level ``fetch_diff``.
    """
    # NOTE(review): callers elsewhere pass a ``reverse`` flag to the
    # underlying function; this wrapper does not expose it - confirm
    # whether that is intentional.
    diff_frame = fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partition,
        parallel=parallel,
    )
    return diff_frame