def test_simple_regex_example_with_implicit_data_asset_names_self_check():
    """Verify the self_check() report of an inferred S3 connector.

    Seeds a (mocked) bucket with keys shaped like "<asset>-<number>.csv";
    the two-group regex infers assets "A" and "B", while "CCC.csv" does not
    match and must be reported as an unmatched data reference.
    """
    region: str = "us-east-1"
    bucket_name: str = "test_bucket"
    s3_resource = boto3.resource("s3", region_name=region)
    s3_resource.create_bucket(Bucket=bucket_name)
    s3_client = boto3.client("s3", region_name=region)

    frame: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    # Every object gets the same CSV payload; encode it once.
    csv_payload: bytes = frame.to_csv(index=False).encode("utf-8")

    object_keys: List[str] = [
        "A-100.csv",
        "A-101.csv",
        "B-1.csv",
        "B-2.csv",
        "CCC.csv",
    ]
    for object_key in object_keys:
        s3_client.put_object(Bucket=bucket_name, Body=csv_payload, Key=object_key)

    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)-(\d+)\.csv",
            "group_names": [
                "data_asset_name",
                "number",
            ],
        },
        bucket=bucket_name,
        prefix="",
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetS3DataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": ["CCC.csv"],
        "unmatched_data_reference_count": 1,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
def test_basic_instantiation():
    """Smoke-test connector construction and reference discovery on mocked S3.

    All four seeded keys match the three-group regex, so the connector must
    count four references and report none as unmatched; a batch request
    naming an unknown datasource must raise ValueError.
    """
    region: str = "us-east-1"
    bucket_name: str = "test_bucket"
    s3_resource = boto3.resource("s3", region_name=region)
    s3_resource.create_bucket(Bucket=bucket_name)
    s3_client = boto3.client("s3", region_name=region)

    frame: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    csv_payload: bytes = frame.to_csv(index=False).encode("utf-8")

    object_keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for object_key in object_keys:
        s3_client.put_object(Bucket=bucket_name, Body=csv_payload, Key=object_key)

    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket=bucket_name,
        prefix="",
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )
def test_complex_regex_example_with_implicit_data_asset_names():
    """Infer data asset names from a dated S3 key layout via regex groups.

    Keys look like "YYYY/MM/<asset>-<id>.csv"; the "data_asset_name" group
    yields assets "alpha" (3 keys) and "beta" (4 keys). Also verifies that
    unknown datasource / data-connector names raise ValueError, and that a
    partition_request pins down a single BatchDefinition.
    """
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "2020/01/alpha-1001.csv",
        "2020/01/beta-1002.csv",
        "2020/02/alpha-1003.csv",
        "2020/02/beta-1004.csv",
        "2020/03/alpha-1005.csv",
        "2020/03/beta-1006.csv",
        "2020/04/beta-1007.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        # Consistency fix: the sibling tests in this module construct the
        # connector with an explicit execution engine; this one omitted it.
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket=bucket,
        prefix="",
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    # Test for an unknown execution environment
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    # The next two requests intentionally omit datasource_name to confirm
    # that asset lookup still works without it.
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    # A fully-specified partition_request must resolve to exactly one batch.
    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "partition_identifiers": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]