def test_s3_select_b_filtered_from_small_multicolumn_parquet():
    """Filter column ``a`` (< 5000) from a small multicolumn parquet object via S3 Select.

    Streams the result through ``PandasCursor``, asserts exactly 5000 rows come
    back, and prints the cursor's scanned/processed/returned byte counters.
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)
    response = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/small.multicolumn.9999.parquet',
        Expression='select a from s3Object where a < 5000',
        ExpressionType='SQL',
        InputSerialization={'CompressionType': 'NONE', 'Parquet': {}},
        OutputSerialization={'CSV': {}})
    df = None
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()
    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            # FIX: pd.concat takes a sequence of frames; passing two frames
            # positionally raises (the second is interpreted as `axis`).
            df = pd.concat([df, partial_df])
    assert len(df) == 5000
    print()
    # FIX: was referencing test_s3_select_from_small_parquet.__name__, a name
    # not defined in this function — print this test's own name instead.
    name = test_s3_select_b_filtered_from_small_multicolumn_parquet.__name__
    print("{} | {}".format(name, cursor.bytes_scanned))
    print("{} | {}".format(name, cursor.bytes_processed))
    print("{} | {}".format(name, cursor.bytes_returned))
def test_s3_select_from_csv():
    """Aggregate ``sum(s_acctbal)`` over a pipe-delimited CSV object via S3 Select.

    Expects a single result row whose value approximates 45103548.64999.
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)
    response = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/supplier.csv',
        Expression='select sum(cast(s_acctbal as float)) from s3Object',
        ExpressionType='SQL',
        InputSerialization={
            'CompressionType': 'NONE',
            'CSV': {'FileHeaderInfo': 'Use', 'RecordDelimiter': '\n', 'FieldDelimiter': '|'}
        },
        OutputSerialization={'CSV': {}})
    df = None
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()
    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            # FIX: pd.concat takes a sequence of frames; passing two frames
            # positionally raises (the second is interpreted as `axis`).
            df = pd.concat([df, partial_df])
    assert len(df) == 1
    assert pd.to_numeric(df.iloc[0]['_0']) == pytest.approx(45103548.64999)
def test_filtered_s3_select_from_parquet():
    """Select all columns from the supplier parquet object filtered on ``s_acctbal > 500.0``.

    Validates the resulting frame (8642 rows, ordinal column names) via
    ``assert_supplier_table``.
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)
    response = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/supplier.parquet',
        Expression='select * from s3Object where cast(s_acctbal as float) > 500.0 ',
        ExpressionType='SQL',
        InputSerialization={'CompressionType': 'NONE', 'Parquet': {}},
        OutputSerialization={'CSV': {}})
    df = None
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()
    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            # FIX: pd.concat takes a sequence of frames; passing two frames
            # positionally raises (the second is interpreted as `axis`).
            df = pd.concat([df, partial_df])
    assert_supplier_table(df, 8642, use_ordinal_columns=True)
def run():
    """Aggregate ``sum(s_suppkey)`` over the large supplier parquet object via S3 Select.

    Expects a single result row; prints the cursor's scanned/processed/returned
    byte counters for inspection.
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)
    response = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/supplier.large.parquet',
        Expression='select sum(cast(s_suppkey as int)) from s3Object',
        ExpressionType='SQL',
        InputSerialization={'CompressionType': 'NONE', 'Parquet': {}},
        OutputSerialization={'CSV': {}})
    df = None
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()
    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            # FIX: pd.concat takes a sequence of frames; passing two frames
            # positionally raises (the second is interpreted as `axis`).
            df = pd.concat([df, partial_df])
    assert len(df) == 1
    # assert pd.to_numeric(df.iloc[0]['_0']) == pytest.approx(22551774325.00404)
    # FIX: was referencing test_s3_select_from_large_parquet.__name__, a name
    # not defined in this function — print this function's own name instead.
    print("{} | {}".format(run.__name__, cursor.bytes_scanned))
    print("{} | {}".format(run.__name__, cursor.bytes_processed))
    print("{} | {}".format(run.__name__, cursor.bytes_returned))
def test_projected_s3_select_from_parquet():
    """Project only ``s_suppkey`` from the supplier parquet object via S3 Select.

    Asserts 10000 rows come back and that the row keyed '1' is present exactly
    once (S3 Select CSV output yields ordinal string columns such as '_0').
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)
    response = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/supplier.parquet',
        Expression='select s_suppkey from s3Object',
        ExpressionType='SQL',
        InputSerialization={'CompressionType': 'NONE', 'Parquet': {}},
        OutputSerialization={'CSV': {}})
    df = None
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()
    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            # FIX: pd.concat takes a sequence of frames; passing two frames
            # positionally raises (the second is interpreted as `axis`).
            df = pd.concat([df, partial_df])
    assert len(df) == 10000
    rows = df[df['_0'] == '1']
    assert len(rows) == 1
    row = rows.iloc[0]
    assert row['_0'] == "1"
def test_projected_vs_all_s3_select_from_parquet():
    """Compare a single-column projection against ``select *`` on the same parquet object.

    Both queries must return 10000 rows containing supplier key '1' exactly
    once, and the full scan must report more bytes scanned than the projection.
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    # Query 1: projected (single column).
    response1 = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/supplier.parquet',
        Expression='select s_suppkey from s3Object',
        ExpressionType='SQL',
        InputSerialization={'CompressionType': 'NONE', 'Parquet': {}},
        OutputSerialization={'CSV': {}})
    df1 = None
    cursor1 = PandasCursor(None)
    cursor1.event_stream = response1['Payload']
    dfs1 = cursor1.parse_event_stream()
    for partial_df1 in dfs1:
        if df1 is None:
            df1 = partial_df1
        else:
            # FIX: pd.concat takes a sequence of frames; passing two frames
            # positionally raises (the second is interpreted as `axis`).
            df1 = pd.concat([df1, partial_df1])

    # Query 2: all columns.
    response2 = s3.select_object_content(
        Bucket='s3filter',
        Key='parquet/supplier.parquet',
        Expression='select * from s3Object',
        ExpressionType='SQL',
        InputSerialization={'CompressionType': 'NONE', 'Parquet': {}},
        OutputSerialization={'CSV': {}})
    df2 = None
    cursor2 = PandasCursor(None)
    cursor2.event_stream = response2['Payload']
    dfs2 = cursor2.parse_event_stream()
    for partial_df2 in dfs2:
        if df2 is None:
            df2 = partial_df2
        else:
            # FIX: same pd.concat sequence-argument bug as above.
            df2 = pd.concat([df2, partial_df2])

    assert len(df1) == 10000
    assert len(df2) == 10000
    rows1 = df1[df1['_0'] == '1']
    rows2 = df2[df2['_0'] == '1']
    assert len(rows1) == 1
    assert len(rows2) == 1
    row1 = rows1.iloc[0]
    row2 = rows2.iloc[0]
    assert row1['_0'] == "1"
    assert row2['_0'] == "1"
    # Projecting one column should scan fewer bytes than a full-table select.
    assert cursor2.bytes_scanned > cursor1.bytes_scanned