def test_table_publish(self, mock_session_helper, mock_create_table):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = ["text_col", "int_col", "float_col"]
    redshift_params = self.setup_redshift_params()

    msh = mock_session_helper(
        region=redshift_params['region'],
        cluster_id=redshift_params['cluster_id'],
        host=redshift_params['host'],
        port=redshift_params['port'],
        db_name=redshift_params['db_name']
    )
    msh.configure_session_helper()

    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions, redshift_params=redshift_params)

    df_types = parq._get_dataframe_datatypes(dataframe, partitions)
    partition_types = parq._get_dataframe_datatypes(dataframe, partitions, True)

    mock_create_table.assert_called_once_with(
        redshift_params['table_name'],
        redshift_params['schema_name'],
        df_types,
        partition_types,
        parq.s3_url(bucket, key),
        msh)
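# The mock arguments to test_table_publish (and test_schema_publish below) are
# presumably injected by unittest.mock.patch decorators along these lines.
# This is a hedged sketch: the exact patch targets in this repo may differ.
# Note that patch decorators apply bottom-up, so the bottom-most decorator
# maps to the first mock argument.
#
# @patch('s3parq.publish_redshift.create_table')
# @patch('s3parq.publish_parq.SessionHelper')
# def test_table_publish(self, mock_session_helper, mock_create_table):
#     ...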
def test_reject_timedelta_dataframes(self):
    columns, dataframe = self.setup_df()
    bucket, key = self.setup_s3()
    partitions = ['text_col']
    dataframe['time_col'] = pd.Timedelta('1 days')

    with pytest.raises(NotImplementedError):
        parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                     partitions=partitions)
def test_works_without_partitions(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)
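# setup_grouped_dataframe and setup_s3 are shared helpers defined elsewhere in
# the suite. A minimal sketch of the contract the tests above rely on, written
# as plain functions (hypothetical; in the suite setup_s3 is a class helper
# and the real column set may differ):
import boto3
import pandas as pd

def setup_grouped_dataframe() -> pd.DataFrame:
    # A small frame carrying the column names the tests reference.
    return pd.DataFrame({
        "text_col": ["a", "b", "c"],
        "int_col": [1, 2, 3],
        "float_col": [1.5, 2.5, 3.5],
        "grouped_col": ["x", "x", "y"],
    })

def setup_s3(s3_client=None):
    # Create a (moto-mocked) bucket and return the bucket/key pair to publish to.
    bucket, key = "safebucketname", "safekeyprefixname/safedatasetname"
    (s3_client or boto3.client("s3")).create_bucket(Bucket=bucket)
    return bucket, key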
def test_reject_empty_dataframe(self):
    dataframe = pd.DataFrame()
    bucket, key = self.setup_s3()

    with pytest.raises(ValueError):
        parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=[])
def test_no_redshift_publish(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)
def test_works_without_partitions(self):
    columns, dataframe = self.setup_df()
    bucket, key = self.setup_s3()
    partitions = []
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)
def test_no_redshift_publish(self):
    columns, dataframe = self.setup_df()
    bucket, key = self.setup_s3()
    partitions = []
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)
def test_no_redshift_publish(self):
    with get_s3_client() as s3_client:
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = []
        parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                     partitions=partitions)
def test_reject_timedelta_dataframes(self):
    with get_s3_client() as s3_client:
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = ['text_col']
        dataframe['time_col'] = pd.Timedelta('1 days')

        with pytest.raises(NotImplementedError):
            parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                         partitions=partitions)
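# get_s3_client is assumed to be a moto-backed context manager so these tests
# never hit real AWS. A minimal sketch under that assumption (hypothetical,
# not necessarily this repo's exact helper):
from contextlib import contextmanager

import boto3
from moto import mock_s3

@contextmanager
def get_s3_client():
    # moto intercepts every S3 call made while the mock is active.
    with mock_s3():
        yield boto3.client('s3')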
def test_set_metadata_correctly(self):
    columns, dataframe = self.setup_df()
    bucket, key = self.setup_s3()
    s3_client = boto3.client('s3')
    partitions = ['grouped_col']
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)

    for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
        if obj['Key'].endswith(".parquet"):
            meta = s3_client.get_object(
                Bucket=bucket, Key=obj['Key'])['Metadata']
            assert meta['partition_data_types'] == str(
                {"grouped_col": "string"})
def test_fetches_diff(self):
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"
    comparison_key = "burger-inventory/buns"
    comparison_bucket = "backroom"
    partitions = ["exp-date"]

    part_types = {"count": "int",
                  "price": "float",
                  "exp-date": "string"}

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28],
        "exp-date": ["x", "z", "a", "zz"]
    })
    comparison_df = pd.DataFrame({
        "count": [2, 3, 4, 9],
        "price": [2.43, 4.35, 1.23, 3.28],
        "exp-date": ["x", "y", "z", "zz"]
    })

    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=input_bucket)
    s3_client.create_bucket(Bucket=comparison_bucket)

    publish(bucket=input_bucket, key=input_key,
            dataframe=input_df, partitions=partitions)
    publish(bucket=comparison_bucket, key=comparison_key,
            dataframe=comparison_df, partitions=partitions)

    # Only exp-date "a" exists in the input but not the comparison, so the
    # diff should be exactly that single row.
    test_df = pd.DataFrame({
        "count": [7],
        "price": [5.76],
        "exp-date": ["a"]
    })

    fetched_diff = fetch_parq.fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partitions[0],
        parallel=False)

    # Both frames hold a single row, so compare the rows directly.
    assert fetched_diff.iloc[0].equals(test_df.iloc[0])
def test_via_public_interface():
    s3_client = boto3.client('s3')
    bucket_name = 'another-bucket'
    key = 'testing/is/fun/dataset-name'
    s3_client.create_bucket(Bucket=bucket_name)

    publish(bucket=bucket_name, key=key, dataframe=df.dataframe,
            partitions=['datetime_options'])

    # moto explodes when we use parallel :( that path needs testing against
    # real boto calls
    result = fetch(bucket=bucket_name, key=key, parallel=False)

    assert result.shape == df.dataframe.shape
    assert df_equal_by_set(result, df.dataframe, df.dataframe.columns.tolist())
def test_input_equals_output(self):
    columns, dataframe = self.setup_df()
    bucket, key = self.setup_s3()
    s3_path = f"s3://{bucket}/{key}"
    partitions = [columns[0]]
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)

    from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
    s3pd = from_s3.read().to_pandas()

    # Every row read back from S3 must have been in the published dataframe.
    s3_rows = set(zip(s3pd.int_col, s3pd.float_col,
                      s3pd.text_col, s3pd.grouped_col))
    published_rows = set(zip(dataframe.int_col, dataframe.float_col,
                             dataframe.text_col, dataframe.grouped_col))
    assert s3_rows - published_rows == set()
def test_parquet_sizes():
    bucket = "testbucket"
    key = "testdataset"
    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=bucket)

    df = DFMock(count=1000)
    df.columns = {"int_col": "int",
                  "str_col": "string",
                  "grouped_col": {"option_count": 4, "option_type": "string"}}
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)

    pub_parq.publish(dataframe=df.dataframe, key=key,
                     bucket=bucket, partitions=['grouped_col'])

    # No single published parquet file should exceed 61MB.
    for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
        if obj['Key'].endswith(".parquet"):
            assert float(obj['Size']) <= 61 * float(1 << 20)
def test_input_equals_output(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    s3_path = f"s3://{bucket}/{key}"
    partitions = [dataframe.columns[0]]
    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions)

    from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
    s3pd = from_s3.read().to_pandas()

    # Switch the partition column's type back; by default it comes back as a
    # category.
    s3pd[partitions[0]] = s3pd[partitions[0]].astype(
        dataframe[partitions[0]].dtype)

    sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)
def test_schema_publish(self, mock_session_helper, mock_create_schema):
    columns, dataframe = self.setup_df()
    bucket, key = self.setup_s3()
    partitions = [columns[0]]
    redshift_params = self.setup_redshift_params()

    msh = mock_session_helper(
        region=redshift_params['region'],
        cluster_id=redshift_params['cluster_id'],
        host=redshift_params['host'],
        port=redshift_params['port'],
        db_name=redshift_params['db_name']
    )
    msh.configure_session_helper()

    parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                 partitions=partitions, redshift_params=redshift_params)

    mock_create_schema.assert_called_once_with(
        redshift_params['schema_name'],
        redshift_params['db_name'],
        redshift_params['iam_role'],
        msh)
def test_end_to_end():
    df = dfmock.DFMock(count=100000)
    df.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer"
    }
    df.generate_dataframe()
    # This is unfortunately big, but getting it to force a partition doesn't
    # work otherwise
    df.grow_dataframe_to_size(500)

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    old_df = pd.DataFrame(df.dataframe)

    # pub it
    publish(bucket=bucket_name, key=key, dataframe=old_df,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == old_df.shape
    assert df_equal_by_set(fetched_df, old_df, old_df.columns)
    sorted_dfs_equal_by_pandas_testing(fetched_df, old_df)
def test_end_to_end():
    df = dfmock.DFMock(count=1000)
    df.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer"
    }
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    # pub it
    publish(bucket=bucket_name, key=key, dataframe=df.dataframe,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == df.dataframe.shape
    # Smoke checks only: the elementwise comparison and head() results are
    # discarded, so these lines just verify the calls run without raising.
    pd.DataFrame.eq(fetched_df, df.dataframe)
    fetched_df.head()
def test_end_to_end():
    # make a sample DF for all the tests
    df = dfmock.DFMock(count=10000)
    df.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer"
    }
    df.generate_dataframe()

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    old_df = pd.DataFrame(df.dataframe)

    # pub it
    publish(bucket=bucket_name, key=key, dataframe=old_df,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == old_df.shape
    assert df_equal_by_set(fetched_df, old_df, old_df.columns)
    sorted_dfs_equal_by_pandas_testing(fetched_df, old_df)
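# df_equal_by_set and sorted_dfs_equal_by_pandas_testing are shared assertion
# helpers not shown in this section. A minimal sketch of what they are assumed
# to check (hypothetical signatures):
import pandas as pd

def df_equal_by_set(left: pd.DataFrame, right: pd.DataFrame, cols) -> bool:
    # Order-insensitive equality: both frames hold the same set of row tuples
    # over the given columns.
    left_rows = set(left[list(cols)].itertuples(index=False, name=None))
    right_rows = set(right[list(cols)].itertuples(index=False, name=None))
    return left_rows == right_rows

def sorted_dfs_equal_by_pandas_testing(left: pd.DataFrame,
                                       right: pd.DataFrame) -> None:
    # Sort both frames the same way, then defer to pandas' own deep check.
    left = left.sort_values(list(left.columns)).reset_index(drop=True)
    right = right.sort_values(list(right.columns)).reset_index(drop=True)
    pd.testing.assert_frame_equal(left, right, check_like=True)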
def test_end_to_end():
    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    # pub it (df is assumed to be a module-level fixture built elsewhere in
    # this file, as in test_via_public_interface)
    publish(bucket=bucket_name, key=key, dataframe=df.dataframe,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    dataframe = fetch(bucket=bucket_name, key=key, parallel=False)

    assert dataframe.shape == df.dataframe.shape
    # Smoke checks only: the elementwise comparison and head() results are
    # discarded, so these lines just verify the calls run without raising.
    pd.DataFrame.eq(dataframe, df.dataframe)
    dataframe.head()
def publish(self, bucket: str, key: str, dataframe: pd.DataFrame,
            partitions: Iterable[str]) -> List:
    return publish(
        dataframe=dataframe,
        bucket=bucket,
        key=key,
        partitions=partitions
    )
def publish(self, bucket: str, key: str, dataframe: pd.DataFrame,
            partitions: Iterable[str]) -> None:
    # publish() returns the list of published file paths (see the wrapper
    # above), so there is no .publish() method to chain on the result.
    publish(
        dataframe=dataframe,
        bucket=bucket,
        key=key,
        partitions=partitions
    )
def test_set_metadata_correctly(self):
    with get_s3_client() as s3_client:
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = ['string_col', 'int_col', 'bool_col']
        parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                     partitions=partitions)

        for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
            if obj['Key'].endswith(".parquet"):
                meta = s3_client.get_object(
                    Bucket=bucket, Key=obj['Key'])['Metadata']
                assert meta['partition_data_types'] == str({
                    "string_col": "string",
                    "int_col": "integer",
                    "bool_col": "boolean"
                })
def test_fetches_diff_none(self):
    input_key = "clay/beads"
    input_bucket = "kiln"
    comparison_key = "new-case"
    comparison_bucket = "storefront"
    partitions = ["price"]

    part_types = {"count": "int", "price": "float"}

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28]
    })

    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=input_bucket)
    s3_client.create_bucket(Bucket=comparison_bucket)

    publish(bucket=input_bucket, key=input_key,
            dataframe=input_df, partitions=partitions)

    # Nothing was published to the comparison dataset, so the forward diff
    # is empty...
    fetched_diff = fetch_parq.fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partitions[0],
        parallel=False)

    assert fetched_diff.empty

    # ...and the reverse diff returns the full input dataset.
    fetched_diff_reverse = fetch_parq.fetch_diff(
        input_bucket=input_bucket,
        input_key=input_key,
        comparison_bucket=comparison_bucket,
        comparison_key=comparison_key,
        partition=partitions[0],
        reverse=True,
        parallel=False)

    fetched_diff_reverse.sort_index(inplace=True)
    input_df.sort_index(inplace=True)

    assert fetched_diff_reverse['count'].equals(input_df['count'])
    assert fetched_diff_reverse['price'].equals(input_df['price'])
def mock_publish(self,
                 partition_types: Dict[str, str],
                 bucket="safebucketname",
                 key='safekeyprefixname/safedatasetname'):
    mocker = MockHelper(count=100, s3=True)
    partitions = list(partition_types.keys())

    dfmock = DFMock()
    dfmock.count = 10

    # add partition columns
    columns = {part_col: {"option_count": 3, "option_type": part_type}
               for part_col, part_type in partition_types.items()}
    # add one actual data column, called metrics
    columns["metrics"] = "int"

    dfmock.columns = columns
    dfmock.generate_dataframe()

    # generate the dataframe we will write
    df = dfmock.dataframe
    bucket = mocker.s3_bucket

    published_files = publish(bucket=bucket, key=key,
                              dataframe=df, partitions=partitions)

    return bucket, df, partitions, published_files
def test_fetch_when_none(self):
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"
    partitions = ["exp-date"]

    part_types = {"count": "int",
                  "price": "float",
                  "exp-date": "str"}

    fetched_dtypes = pd.Series(["int64", "float64", "object"],
                               index=["count", "price", "exp-date"])

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28],
        "exp-date": ["x", "z", "a", "zz"]
    })

    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket=input_bucket)

    publish(bucket=input_bucket, key=input_key,
            dataframe=input_df, partitions=partitions)

    filters = [{
        "partition": "exp-date",
        "comparison": "==",
        "values": ["not-there"]
    }]

    fetched = fetch_parq.fetch(bucket=input_bucket, key=input_key,
                               filters=filters, parallel=False)

    # Testing that the DF is empty and has the expected columns + dtypes
    assert fetched.empty
    assert fetched.dtypes.equals(fetched_dtypes)
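# For reference, fetch's filters take the shape shown in test_fetch_when_none
# above; a filter that does match (hypothetical values) would return only the
# rows from partitions satisfying the comparison:
#
# filters = [{"partition": "exp-date", "comparison": "==", "values": ["x"]}]
# fetched = fetch_parq.fetch(bucket=input_bucket, key=input_key,
#                            filters=filters, parallel=False)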