def test_fetches_nons3parq_large_parquet():
    """Fetch a large parquet dataset that was staged by a helper, not publish().

    A mock dataframe is generated, written to S3 through
    ``setup_nons3parq_parquet`` and read back with ``fetch_parq.fetch``;
    the fetched frame must match the input in shape and, after sorting,
    in content.
    """
    # NOTE(review): no S3 mock is applied inside this block -- confirm one is
    # active (decorator or fixture) so boto3 never reaches real AWS.
    bucket = "loadingdock"
    key = "burger-shipment/buns"

    mock = dfmock.DFMock(count=100000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()
    # Deliberately large: a smaller frame does not force the data to be
    # split into partitions, which is the case under test.
    mock.grow_dataframe_to_size(500)  # presumably megabytes -- confirm in dfmock
    expected = pd.DataFrame(mock.dataframe)

    client = boto3.client('s3')
    setup_nons3parq_parquet(
        dataframe=expected,
        bucket=bucket,
        key=key,
        s3_client=client,
    )

    # The dataset is read back from the same bucket/key it was written to.
    actual = fetch_parq.fetch(bucket=bucket, key=key, parallel=False)

    assert actual.shape == expected.shape
    sorted_dfs_equal_by_pandas_testing(actual, expected)
def test_end_to_end():
    """Round-trip a large mock dataframe through publish() and fetch().

    The frame is published partitioned on three of its columns, fetched
    back serially (``parallel=False``), and compared to the original by
    shape, by per-column value sets, and by sorted content.
    """
    # NOTE(review): another function with this same name appears later in the
    # file as reviewed; if both live in one module only the last definition is
    # bound and collected -- confirm.
    # NOTE(review): no S3 mock is applied inside this block -- confirm one is
    # active so create_bucket() does not reach real AWS.
    target_bucket = 'thistestbucket'
    dataset_key = 'thisdataset'
    partition_columns = ['string_options', 'datetime_options', 'float_options']

    mock = dfmock.DFMock(count=100000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()
    # Deliberately large: a smaller frame does not force a partition.
    mock.grow_dataframe_to_size(500)  # presumably megabytes -- confirm in dfmock

    boto3.client('s3').create_bucket(Bucket=target_bucket)
    expected = pd.DataFrame(mock.dataframe)

    # Publish the dataset ...
    publish(
        bucket=target_bucket,
        key=dataset_key,
        dataframe=expected,
        partitions=partition_columns,
    )

    # ... then read it back.
    actual = fetch(bucket=target_bucket, key=dataset_key, parallel=False)

    assert actual.shape == expected.shape
    assert df_equal_by_set(actual, expected, expected.columns)
    sorted_dfs_equal_by_pandas_testing(actual, expected)
def test_end_to_end():
    """Round-trip a mock dataframe through publish() and fetch().

    The generated frame is published partitioned on three of its columns,
    fetched back serially (``parallel=False``), and compared with the
    original by shape and by per-column value sets.
    """
    # NOTE(review): another function with this same name appears later in the
    # file as reviewed; if both live in one module only the last definition is
    # bound and collected -- confirm.
    # NOTE(review): no S3 mock is applied inside this block -- confirm one is
    # active so create_bucket() does not reach real AWS.
    df = dfmock.DFMock(count=1000)
    df.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)  # presumably megabytes -- confirm in dfmock

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    # pub it
    publish(
        bucket=bucket_name,
        key=key,
        dataframe=df.dataframe,
        partitions=['string_options', 'datetime_options', 'float_options'],
    )

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == df.dataframe.shape
    # Previously the result of DataFrame.eq() was discarded, so content was
    # never actually checked. Compare by value sets, because row/column order
    # is not assumed to survive the partitioned round trip.
    assert df_equal_by_set(fetched_df, df.dataframe, df.dataframe.columns)
def test_end_to_end():
    """Publish a generated dataframe, fetch it back, and compare the two.

    The frame is published partitioned on three of its columns and fetched
    serially (``parallel=False``); the result must match the original by
    shape, by per-column value sets, and by sorted content.
    """
    # NOTE(review): no S3 mock is applied inside this block -- confirm one is
    # active so create_bucket() does not reach real AWS.
    target_bucket = 'thistestbucket'
    dataset_key = 'thisdataset'

    # Sample data used by this test.
    mock = dfmock.DFMock(count=10000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()

    boto3.client('s3').create_bucket(Bucket=target_bucket)
    expected = pd.DataFrame(mock.dataframe)

    # Write the dataset out ...
    publish(
        bucket=target_bucket,
        key=dataset_key,
        dataframe=expected,
        partitions=['string_options', 'datetime_options', 'float_options'],
    )

    # ... and read it back in.
    actual = fetch(bucket=target_bucket, key=dataset_key, parallel=False)

    assert actual.shape == expected.shape
    assert df_equal_by_set(actual, expected, expected.columns)
    sorted_dfs_equal_by_pandas_testing(actual, expected)
import boto3
import dfmock
import moto
import pandas as pd
import pytest

import s3parq
from s3parq.testing_helper import df_equal_by_set
from s3parq.publish_parq import publish
from s3parq.fetch_parq import fetch

# Module-level sample-data generator shared by the tests in this file.
# Only the column layout is configured here; generate_dataframe() is not
# called in this span.
df = dfmock.DFMock(count=10000)
df.columns = {
    "string_options": {"option_count": 4, "option_type": "string"},
    "int_options": {"option_count": 4, "option_type": "int"},
    "datetime_options": {"option_count": 5, "option_type": "datetime"},
    "float_options": {"option_count": 2, "option_type": "float"},
    "metrics": "integer",
}