def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(A.id)

def test_partition_request_partition_request_partition_identifiers_1_key(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "partition_identifiers": {"timestamp": "20200809"},
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 4

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected

def test_custom_list(periodic_table_of_elements):
    Hydrogen = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    Helium = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"element": "Helium"}),
    )
    Lithium = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"element": "Lithium"}),
    )
    batch_list = [Hydrogen, Helium, Lithium]

    my_sorter = CustomListSorter(
        name="element", orderby="desc", reference_list=periodic_table_of_elements
    )
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Lithium, Helium, Hydrogen]

    my_sorter = CustomListSorter(
        name="element", orderby="asc", reference_list=periodic_table_of_elements
    )
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Hydrogen, Helium, Lithium]

def test_create_three_batch_definitions_sort_lexicographically():
    a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": "A"}),
    )
    b = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"id": "B"}),
    )
    c = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": "C"}),
    )
    batch_list = [a, b, c]

    # sorting by "id" reverse alphabetically (descending)
    my_sorter = LexicographicSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [c, b, a]

    # sorting by "id" alphabetically (ascending)
    my_sorter = LexicographicSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [a, b, c]

def test_sorter_instantiation_custom_list_with_periodic_table(
    periodic_table_of_elements,
):
    # CustomListSorter
    sorter_params: dict = {
        "reference_list": periodic_table_of_elements,
    }
    my_custom_sorter = CustomListSorter(name="element", orderby="asc", **sorter_params)
    # noinspection PyProtectedMember
    assert my_custom_sorter._reference_list == periodic_table_of_elements

    # this element exists: Hydrogen
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    returned_partition_key = my_custom_sorter.get_batch_key(test_batch_def)
    assert returned_partition_key == 0

    # this element does not exist: Vibranium
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        batch_identifiers=IDDict({"element": "Vibranium"}),
    )
    with pytest.raises(ge_exceptions.SorterError):
        my_custom_sorter.get_batch_key(test_batch_def)

def test_batch_definition_id():
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(A.id)

    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    print(B.id)

    assert A.id != B.id

def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker,PyUnusedLocal,PyPep8Naming
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    print(A.id)

def test_map_data_reference_string_to_batch_definition_list_using_regex():
    # regex_pattern does not match --> None
    data_reference = "alex_20200809_1000.csv"
    regex_pattern = r"^(.+)_____________\.csv$"
    group_names = ["name", "timestamp", "price"]
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list is None

    # no data_asset_name configured --> DEFAULT_ASSET_NAME
    data_reference = "alex_20200809_1000.csv"
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list == [
        BatchDefinition(
            datasource_name="test_datasource",
            data_connector_name="test_data_connector",
            data_asset_name="DEFAULT_ASSET_NAME",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        )
    ]

    # data_asset_name configured
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name="test_data_asset",
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list == [
        BatchDefinition(
            datasource_name="test_datasource",
            data_connector_name="test_data_connector",
            data_asset_name="test_data_asset",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        )
    ]

def test_data_connector_query_sorted_filtered_by_custom_filter_with_index_as_slice_via_string_no_left_right_no_step(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a named function and a lambda are acceptable Callable types for a custom filter.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return batch_identifiers["name"] in ["abe", "james", "eugene"] and (
            datetime.datetime.strptime(batch_identifiers["timestamp"], "%Y%m%d").date()
            > datetime.datetime(2020, 7, 15).date()
        )

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": ":3",
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 3

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected

def test_batch_definition_equality():
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    assert A != B

    A2 = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    assert A == A2

def test_batch_definition_id():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    print(A.id)

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", batch_identifiers=IDDict({"id": "B"}))
    print(B.id)

    assert A.id != B.id

def test_map_batch_definition_to_data_reference_string_using_regex():
    # not a BatchDefinition
    my_batch_definition = "I_am_a_string"
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(TypeError):
        # noinspection PyUnusedLocal,PyTypeChecker
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # group names do not match
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            {"name": "eugene", "timestamp": "20200809", "price": "1500"}
        ),
    )
    group_names = ["i", "wont", "match"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(KeyError):
        # noinspection PyUnusedLocal
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # success
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            {"name": "eugene", "timestamp": "20200809", "price": "1500"}
        ),
    )
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert my_data_reference == "eugene_20200809_1500.csv"

def test_date_time():
    first = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"date": "20210101"}),
    )
    second = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"date": "20210102"}),
    )
    third = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": "20210103"}),
    )
    batch_list = [first, second, third]

    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [third, second, first]

    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [first, second, third]

    with pytest.raises(ge_exceptions.SorterError):
        # a numeric datetime_format is rejected
        i_dont_work = DateTimeSorter(name="date", datetime_format=12345, orderby="desc")

    my_date_is_not_a_string = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": 20210103}),
    )
    batch_list = [first, second, third, my_date_is_not_a_string]
    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="desc")
    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)

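# A minimal sketch (assumed behavior, not asserted by the original suite):
# DateTimeSorter shares the Sorter.get_batch_key API demonstrated for
# CustomListSorter elsewhere in this file, so the sort key for a batch should
# be the "date" identifier parsed with the configured datetime_format.
def example_date_time_sorter_key():
    sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="asc")
    batch_def = BatchDefinition(
        datasource_name="X",  # hypothetical names, for illustration only
        data_connector_name="x",
        data_asset_name="xxx",
        batch_identifiers=IDDict({"date": "20210101"}),
    )
    # the key is expected to be a datetime parsed from the identifier string
    assert sorter.get_batch_key(batch_def) == datetime.datetime(2021, 1, 1)
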
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    batch_def: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if the key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )

def test_batch_definition_equality():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    assert A != B

    # noinspection PyUnusedLocal,PyPep8Naming
    A2 = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    assert A == A2

def test_partition_request_sorted_filtered_by_custom_filter_with_slice_as_list(
    create_files_and_instantiate_data_connector,
):
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a named function and a lambda are acceptable Callable types for a custom filter.
    def my_custom_partition_selector(partition_definition: dict) -> bool:
        return partition_definition["name"] in ["abe", "james", "eugene"] and (
            datetime.datetime.strptime(
                partition_definition["timestamp"], "%Y%m%d"
            ).date()
            > datetime.datetime(2020, 7, 15).date()
        )

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "custom_filter_function": my_custom_partition_selector,
                    "index": [1, 3],
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected

def test_data_connector_query_sorted_filtered_by_custom_filter_with_slice_obj(
    create_files_and_instantiate_data_connector,
):
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a named function and a lambda are acceptable Callable types for a custom filter.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return batch_identifiers["name"] in ["abe", "james", "eugene"] and (
            datetime.datetime.strptime(batch_identifiers["timestamp"], "%Y%m%d").date()
            > datetime.datetime(2020, 7, 15).date()
        )

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": slice(3, 5, None),
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected

def test__build_batch_spec(basic_datasource):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    # noinspection PyProtectedMember
    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        ),
        batch_data=pd.DataFrame({"x": range(10)}),
    )
    assert type(batch_spec) == RuntimeDataBatchSpec
    assert set(batch_spec.keys()) == {"batch_data"}
    assert batch_spec["batch_data"].shape == (10, 1)

def test__generate_batch_spec_parameters_from_batch_definition(
    basic_datasource,
):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    expected_batch_spec_parameters: dict = {}

    # noinspection PyProtectedMember
    batch_spec_parameters: dict = test_runtime_data_connector._generate_batch_spec_parameters_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        )
    )

    assert batch_spec_parameters == expected_batch_spec_parameters

def test_asset_is_name_batch_identifier_correctly_used(
    basic_datasource_with_assets, test_df_pandas
):
    """
    Using asset_a, which is named in the RuntimeDataConnector configuration,
    with batch_identifiers that are also named there.
    """
    runtime_data_connector: RuntimeDataConnector = (
        basic_datasource_with_assets.data_connectors["runtime"]
    )
    res: List[BatchDefinition] = (
        runtime_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=basic_datasource_with_assets.name,
                data_connector_name="runtime",
                data_asset_name="asset_a",
                batch_identifiers={"month": 4, "day": 1},
                runtime_parameters={"batch_data": test_df_pandas},
            )
        )
    )
    assert len(res) == 1
    assert res[0] == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="runtime",
        data_asset_name="asset_a",
        batch_identifiers=IDDict({"month": 4, "day": 1}),
    )

def test_data_connector_query_data_connector_query_batch_identifiers_2_key_name_timestamp(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "batch_filter_parameters": {
                        "timestamp": "20200809",
                        "name": "will",
                    },
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected

def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[
        Tuple[str, PartitionDefinitionSubset]
    ] = convert_data_reference_string_to_partition_definition_using_regex(
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    if processed_data_reference is None:
        return None
    data_asset_name_from_partition_definition: str = processed_data_reference[0]
    partition_definition: PartitionDefinitionSubset = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_partition_definition
    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
    ]

def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[
        Tuple[str, IDDict]
    ] = convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    if processed_data_reference is None:
        return None
    data_asset_name_from_batch_identifiers: str = processed_data_reference[0]
    batch_identifiers: IDDict = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_batch_identifiers
    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
        )
    ]

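# A minimal usage sketch of the function above (names here are illustrative,
# not from the original suite): a data reference that matches the regex yields
# a one-element list whose batch_identifiers come from the named groups, while
# a non-matching reference yields None (see the regex tests elsewhere in this
# file for the same behavior).
def example_map_data_reference():
    batch_definitions = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="example_datasource",  # hypothetical
        data_connector_name="example_data_connector",  # hypothetical
        data_asset_name="example_asset",  # hypothetical
        data_reference="alex_20200809_1000.csv",
        regex_pattern=r"^(.+)_(\d+)_(\d+)\.csv$",
        group_names=["name", "timestamp", "price"],
    )
    assert batch_definitions is not None
    assert batch_definitions[0].batch_identifiers == IDDict(
        {"name": "alex", "timestamp": "20200809", "price": "1000"}
    )
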
def batch_fixture() -> Batch:
    """
    Fixture for a Batch object that contains data, a BatchRequest, and a
    BatchDefinition, as well as a BatchSpec and BatchMarkers, for use in
    unit testing.
    """
    df: pd.DataFrame = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]}
    )
    batch_request: BatchRequest = BatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
    )
    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
        batch_identifiers=IDDict({"id": "A"}),
    )
    batch_spec: BatchSpec = BatchSpec(path="/some/path/some.file")
    batch_markers: BatchMarkers = BatchMarkers(ge_load_time="FAKE_LOAD_TIME")
    batch: Batch = Batch(
        data=df,
        batch_request=batch_request,
        batch_definition=batch_definition,
        batch_spec=batch_spec,
        batch_markers=batch_markers,
    )
    return batch

def ge_validator_sqlalchemy() -> Validator:
    validator = Validator(
        execution_engine=SqlAlchemyExecutionEngine(
            connection_string="postgresql://localhost:5432/test"
        ),
        batches=[
            Batch(
                data=None,
                batch_request=BatchRequest(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                ),
                batch_definition=BatchDefinition(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                    batch_identifiers=IDDict(),
                ),
                batch_spec=SqlAlchemyDatasourceBatchSpec(
                    {
                        "data_asset_name": "foo2",
                        "table_name": "foo2",
                        "batch_identifiers": {},
                        "schema_name": "public",
                        "type": "table",
                    }
                ),
            )
        ],
    )
    return validator

def test_partition_request_partition_request_partition_identifiers_1_key_and_index(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "partition_identifiers": {"name": "james"},
                    "index": 0,
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected

def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (
        batch.__str__()
        == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name"
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "batch_identifiers": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}"""
    )

def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
    self._validate_batch_request(batch_request=batch_request)

    if self._data_references_cache is None:
        self._refresh_data_references_cache()

    batch_definition_list: List[BatchDefinition] = []
    try:
        sub_cache = self._data_references_cache[batch_request.data_asset_name]
    except KeyError:
        raise KeyError(
            f"data_asset_name {batch_request.data_asset_name} is not recognized."
        )
    for partition_definition in sub_cache:
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
        if batch_definition_matches_batch_request(batch_definition, batch_request):
            batch_definition_list.append(batch_definition)
    return batch_definition_list

def build_batch_spec(self, batch_definition: BatchDefinition) -> PathBatchSpec:
    """
    Build a BatchSpec from a batch_definition by calling the DataConnector's
    build_batch_spec method.

    Args:
        batch_definition (BatchDefinition): used to build the batch_spec

    Returns:
        BatchSpec built from batch_definition
    """
    data_asset_name: str = batch_definition.data_asset_name
    if (
        data_asset_name in self.assets
        and self.assets[data_asset_name].batch_spec_passthrough
        and isinstance(self.assets[data_asset_name].batch_spec_passthrough, dict)
    ):
        # batch_spec_passthrough from the data_asset
        batch_spec_passthrough = deepcopy(
            self.assets[data_asset_name]["batch_spec_passthrough"]
        )
        batch_definition_batch_spec_passthrough = (
            deepcopy(batch_definition.batch_spec_passthrough) or {}
        )
        # batch_spec_passthrough from the BatchDefinition supersedes batch_spec_passthrough from the data_asset
        batch_spec_passthrough.update(batch_definition_batch_spec_passthrough)
        batch_definition.batch_spec_passthrough = batch_spec_passthrough

    batch_spec: PathBatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    return batch_spec

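# A minimal sketch of the precedence rule implemented above, using
# hypothetical literal dicts in place of real asset / BatchDefinition
# configuration: when the two batch_spec_passthrough dicts are merged, keys
# supplied on the BatchDefinition override keys configured on the data asset.
def example_batch_spec_passthrough_precedence():
    asset_passthrough = {
        "reader_method": "read_csv",
        "splitter_method": "_split_on_whole_table",
    }
    definition_passthrough = {"reader_method": "read_parquet"}
    merged = deepcopy(asset_passthrough)
    merged.update(definition_passthrough)
    assert merged["reader_method"] == "read_parquet"  # BatchDefinition wins
    assert merged["splitter_method"] == "_split_on_whole_table"  # asset value retained
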
def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
    self._validate_batch_request(batch_request=batch_request)

    if len(self._data_references_cache) == 0:
        self._refresh_data_references_cache()

    batch_definition_list: List[BatchDefinition] = []
    try:
        sub_cache = self._data_references_cache[batch_request.data_asset_name]
    except KeyError:
        raise KeyError(
            f"data_asset_name {batch_request.data_asset_name} is not recognized."
        )
    for batch_identifiers in sub_cache:
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
            batch_spec_passthrough=batch_request.batch_spec_passthrough,
        )
        if batch_definition_matches_batch_request(batch_definition, batch_request):
            batch_definition_list.append(batch_definition)
    return batch_definition_list

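# A hedged usage sketch of the matching helper the loop above relies on
# (names are illustrative; the exact semantics belong to
# batch_definition_matches_batch_request): a BatchRequest naming the same
# datasource, data connector, and data asset is expected to match a cached
# BatchDefinition regardless of its batch_identifiers when the request
# carries no further constraints.
def example_batch_definition_matches_request():
    definition = BatchDefinition(
        datasource_name="example_datasource",  # hypothetical
        data_connector_name="example_connector",  # hypothetical
        data_asset_name="example_asset",  # hypothetical
        batch_identifiers=IDDict({"index": "0"}),
    )
    request = BatchRequest(
        datasource_name="example_datasource",
        data_connector_name="example_connector",
        data_asset_name="example_asset",
    )
    assert batch_definition_matches_batch_request(
        batch_definition=definition, batch_request=request
    )
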