def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
    """Return all cached BatchDefinition objects that match ``batch_request``.

    Lazily populates the data-references cache on first use, then builds a
    BatchDefinition for every set of batch identifiers cached under the
    requested data asset, keeping only those that match the request.

    Args:
        batch_request: the BatchRequest to match against.

    Returns:
        List[BatchDefinition]: the matching batch definitions (possibly empty).

    Raises:
        KeyError: if ``batch_request.data_asset_name`` is not in the cache.
    """
    self._validate_batch_request(batch_request=batch_request)

    # Lazily populate the cache the first time this connector is queried.
    if len(self._data_references_cache) == 0:
        self._refresh_data_references_cache()

    try:
        sub_cache = self._data_references_cache[batch_request.data_asset_name]
    except KeyError as e:
        # Chain the original lookup failure instead of discarding it.
        raise KeyError(
            f"data_asset_name {batch_request.data_asset_name} is not recognized."
        ) from e

    batch_definition_list: List[BatchDefinition] = []
    for batch_identifiers in sub_cache:
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
            batch_spec_passthrough=batch_request.batch_spec_passthrough,
        )
        if batch_definition_matches_batch_request(batch_definition, batch_request):
            batch_definition_list.append(batch_definition)
    return batch_definition_list
def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
    """Return all cached BatchDefinition objects that match ``batch_request``.

    Refreshes the data-references cache when it has not yet been built, then
    constructs a BatchDefinition for each partition cached under the requested
    data asset and keeps the ones that match the request.

    Args:
        batch_request: the BatchRequest to match against.

    Returns:
        List[BatchDefinition]: the matching batch definitions (possibly empty).

    Raises:
        KeyError: if ``batch_request.data_asset_name`` is not in the cache.
    """
    self._validate_batch_request(batch_request=batch_request)

    if self._data_references_cache is None:
        self._refresh_data_references_cache()

    try:
        sub_cache = self._data_references_cache[batch_request.data_asset_name]
    except KeyError as e:
        # Chain the original lookup failure instead of discarding it.
        raise KeyError(
            f"data_asset_name {batch_request.data_asset_name} is not recognized."
        ) from e

    batch_definition_list: List[BatchDefinition] = []
    for partition_definition in sub_cache:
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
        if batch_definition_matches_batch_request(batch_definition, batch_request):
            batch_definition_list.append(batch_definition)
    return batch_definition_list
def get_batch_definition_list_from_batch_request(
    self,
    batch_request: BatchRequest,
) -> List[BatchDefinition]:
    """Return a single-element (or empty) BatchDefinition list for the request.

    Builds one BatchDefinition from the request's partition identifiers under
    the default data asset name and returns it only if it matches the request.

    Args:
        batch_request: the BatchRequest to process; its ``partition_request``
            may carry a ``partition_identifiers`` dict.

    Returns:
        A list containing the single matching BatchDefinition, or an empty
        list if the constructed definition does not match the request.
    """
    self._validate_batch_request(batch_request=batch_request)

    partition_identifiers: Optional[dict] = None
    if batch_request.partition_request:
        # Fetch once and reuse (the original called .get() twice).
        partition_identifiers = batch_request.partition_request.get(
            "partition_identifiers"
        )
        self._validate_partition_identifiers(
            partition_identifiers=partition_identifiers
        )
    if not partition_identifiers:
        partition_identifiers = {}

    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name=self.datasource_name,
        data_connector_name=self.name,
        data_asset_name=DEFAULT_DATA_ASSET_NAME,
        partition_definition=PartitionDefinition(partition_identifiers),
    )
    if batch_definition_matches_batch_request(
        batch_definition=batch_definition, batch_request=batch_request
    ):
        return [batch_definition]
    return []
def get_batch_definition_list_from_batch_request(
    self,
    batch_request: BatchRequest,
) -> List[BatchDefinition]:
    """Return cached batch definitions that match ``batch_request``.

    The cached definitions are filtered against the request, optionally
    narrowed by the request's partition query, and sorted when this data
    connector has sorters configured.

    Args:
        batch_request: the BatchRequest to match against.

    Returns:
        A list of matching BatchDefinition objects (possibly empty).
    """
    self._validate_batch_request(batch_request=batch_request)

    if self._data_references_cache is None:
        self._refresh_data_references_cache()

    # Keep only the cached definitions that satisfy the request.
    matching_definitions: List[BatchDefinition] = [
        candidate
        for candidate in self._get_batch_definition_list_from_cache()
        if batch_definition_matches_batch_request(
            batch_definition=candidate, batch_request=batch_request
        )
    ]

    if batch_request.partition_request is not None:
        partition_query_obj: PartitionQuery = build_partition_query(
            partition_request_dict=batch_request.partition_request
        )
        matching_definitions = partition_query_obj.select_from_partition_request(
            batch_definition_list=matching_definitions
        )

    if len(self.sorters) > 0:
        return self._sort_batch_definition_list(
            batch_definition_list=matching_definitions
        )
    return matching_definitions
def _get_batch_definition_list_from_batch_request(
    self,
    batch_request: BatchRequestBase,
) -> List[BatchDefinition]:
    """
    Retrieve batch_definitions that match batch_request.

    The cached definitions are first filtered against the request, then sorted
    when this data connector has sorters configured, and finally narrowed by
    the request's data_connector_query (honoring ``batch_request.limit`` when
    the query itself does not specify a limit).

    Args:
        batch_request (BatchRequestBase): BatchRequestBase (BatchRequest
            without attribute validation) to process

    Returns:
        A list of BatchDefinition objects that match BatchRequest
    """
    self._validate_batch_request(batch_request=batch_request)

    if len(self._data_references_cache) == 0:
        self._refresh_data_references_cache()

    matching: List[BatchDefinition] = [
        candidate
        for candidate in self._get_batch_definition_list_from_cache()
        if batch_definition_matches_batch_request(
            batch_definition=candidate, batch_request=batch_request
        )
    ]

    # Sort before applying the query so any limit acts on ordered results.
    if len(self.sorters) > 0:
        matching = self._sort_batch_definition_list(batch_definition_list=matching)

    if batch_request.data_connector_query is not None:
        # Copy so the injected limit does not mutate the caller's request.
        query_dict = batch_request.data_connector_query.copy()
        if batch_request.limit is not None and query_dict.get("limit") is None:
            query_dict["limit"] = batch_request.limit
        batch_filter_obj: BatchFilter = build_batch_filter(
            data_connector_query_dict=query_dict
        )
        matching = batch_filter_obj.select_from_data_connector_query(
            batch_definition_list=matching
        )

    return matching
def _get_batch_definition_list_from_batch_request(
    self,
    batch_request: BatchRequestBase,
) -> List[BatchDefinition]:
    """
    Retrieve batch_definitions that match batch_request.

    The cached definitions are filtered against the request, then narrowed by
    the request's partition_query if one is present, and finally sorted when
    this data connector has sorters configured.

    Args:
        batch_request (BatchRequestBase): BatchRequestBase (BatchRequest
            without attribute validation) to process

    Returns:
        A list of BatchDefinition objects that match BatchRequest
    """
    self._validate_batch_request(batch_request=batch_request)

    if self._data_references_cache is None:
        self._refresh_data_references_cache()

    candidates: List[BatchDefinition] = [
        definition
        for definition in self._get_batch_definition_list_from_cache()
        if batch_definition_matches_batch_request(
            batch_definition=definition, batch_request=batch_request
        )
    ]

    if batch_request.partition_request is not None:
        partition_query_obj: PartitionQuery = build_partition_query(
            partition_request_dict=batch_request.partition_request
        )
        candidates = partition_query_obj.select_from_partition_request(
            batch_definition_list=candidates
        )

    if len(self.sorters) > 0:
        return self._sort_batch_definition_list(batch_definition_list=candidates)
    return candidates
def test_batch_definition_matches_batch_request():
    """Check matcher behavior for matching and non-matching request variants."""
    definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            {"name": "eugene", "timestamp": "20200809", "price": "1500"}
        ),
    )

    # fully matching_batch_request
    request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=None,
    )
    assert batch_definition_matches_batch_request(definition, request) is True

    # execution environment doesn't match
    request = BatchRequest(
        datasource_name="i_dont_match",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=None,
    )
    assert batch_definition_matches_batch_request(definition, request) is False

    # data_connector_name doesn't match
    request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="i_dont_match",
        data_asset_name="TestFiles",
        data_connector_query=None,
    )
    assert batch_definition_matches_batch_request(definition, request) is False

    # data_asset_name doesn't match
    request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="i_dont_match",
        data_connector_query=None,
    )
    assert batch_definition_matches_batch_request(definition, request) is False

    # batch_request.data_connector_query.batch_filter_parameters is not dict
    request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query={"batch_filter_parameters": 1},
    )
    assert batch_definition_matches_batch_request(definition, request) is False

    # batch_identifiers do not match batch_definition.batch_identifiers
    request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query={"batch_filter_parameters": {"i": "wont_work"}},
    )
    assert batch_definition_matches_batch_request(definition, request) is False
def test__batch_definition_matches_batch_request():
    # TODO: <Alex>We need to cleanup PyCharm warnings.</Alex>
    """Verify the matcher across datasource/connector/asset/partition cases."""
    definition_a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition({"id": "A"}),
    )

    # Match / mismatch on datasource_name alone.
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a, batch_request=BatchRequest(datasource_name="A")
    )
    assert not batch_definition_matches_batch_request(
        batch_definition=definition_a, batch_request=BatchRequest(datasource_name="B")
    )

    # Progressively more specific matching requests.
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequest(datasource_name="A", data_connector_name="a"),
    )
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequest(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
        ),
    )

    # Mismatched data_asset_name.
    assert not batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequest(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="bbb",
        ),
    )

    # Mismatched partition identifiers.
    assert not batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequest(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
            partition_request={"partition_identifiers": {"id": "B"}},
        ),
    )

    # Matching partition identifiers with no other constraints.
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequest(
            partition_request={"partition_identifiers": {"id": "A"}}
        ),
    )

    # A request with partition_request=None matches any partition.
    assert batch_definition_matches_batch_request(
        batch_definition=BatchDefinition(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "partition_definition": PartitionDefinition({"index": "3"}),
            }
        ),
        batch_request=BatchRequest(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "partition_request": None,
            }
        ),
    )
def test__batch_definition_matches_batch_request():
    # TODO: <Alex>We need to cleanup PyCharm warnings.</Alex>
    """Verify the matcher across datasource/connector/asset/identifier cases."""
    definition_a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": "A"}),
    )

    # Match / mismatch on datasource_name alone.
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="A", data_connector_name="", data_asset_name=""
        ),
    )
    assert not batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="B", data_connector_name="", data_asset_name=""
        ),
    )

    # Progressively more specific matching requests.
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="A", data_connector_name="a", data_asset_name=""
        ),
    )
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="A", data_connector_name="a", data_asset_name="aaa"
        ),
    )

    # Mismatched data_asset_name.
    assert not batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="A", data_connector_name="a", data_asset_name="bbb"
        ),
    )

    # Mismatched batch filter parameters.
    assert not batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
            data_connector_query={"batch_filter_parameters": {"id": "B"}},
        ),
    )

    # Matching batch filter parameters with no other constraints.
    assert batch_definition_matches_batch_request(
        batch_definition=definition_a,
        batch_request=BatchRequestBase(
            datasource_name="",
            data_connector_name="",
            data_asset_name="",
            data_connector_query={"batch_filter_parameters": {"id": "A"}},
        ),
    )

    # A request with data_connector_query=None matches any identifiers.
    assert batch_definition_matches_batch_request(
        batch_definition=BatchDefinition(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "batch_identifiers": IDDict({"index": "3"}),
            }
        ),
        batch_request=BatchRequest(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "data_connector_query": None,
            }
        ),
    )