def test__get_data_reference_name(basic_datasource):
    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        }
    }
    partition_definition = PartitionDefinition(
        partition_request["batch_identifiers"])

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])

    assert (test_runtime_data_connector._get_data_reference_name(
        partition_definition) == "1234567890")

    partition_request: dict = {
        "batch_identifiers": {
            "run_id_1": 1234567890,
            "run_id_2": 1111111111,
        }
    }
    partition_definition = PartitionDefinition(
        partition_request["batch_identifiers"])

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])

    assert (test_runtime_data_connector._get_data_reference_name(
        partition_definition) == "1234567890-1111111111")

def test_sorter_instantiation_custom_list_with_periodic_table(
    periodic_table_of_elements,
):
    # CustomListSorter
    sorter_params: dict = {
        "reference_list": periodic_table_of_elements,
    }
    my_custom_sorter = CustomListSorter(name="element",
                                        orderby="asc",
                                        **sorter_params)
    # noinspection PyProtectedMember
    assert my_custom_sorter._reference_list == periodic_table_of_elements
    # This element exists: Hydrogen
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        partition_definition=PartitionDefinition({"element": "Hydrogen"}),
    )
    returned_partition_key = my_custom_sorter.get_partition_key(test_batch_def)
    assert returned_partition_key == 0

    # This element does not exist: Vibranium
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        partition_definition=PartitionDefinition({"element": "Vibranium"}),
    )
    with pytest.raises(ge_exceptions.SorterError):
        my_custom_sorter.get_partition_key(test_batch_def)

def test__build_batch_spec(basic_datasource):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    # noinspection PyProtectedMember
    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        ),
        batch_data=pd.DataFrame({"x": range(10)}),
    )
    assert type(batch_spec) == RuntimeDataBatchSpec
    assert set(batch_spec.keys()) == {"batch_data"}
    assert batch_spec["batch_data"].shape == (10, 1)

def test__generate_batch_spec_parameters_from_batch_definition(
    basic_datasource,
):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    expected_batch_spec_parameters: dict = {}

    # noinspection PyProtectedMember
    batch_spec_parameters: dict = test_runtime_data_connector._generate_batch_spec_parameters_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        )
    )

    assert batch_spec_parameters == expected_batch_spec_parameters
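
The test snippets above use names such as pd, pytest, BatchDefinition, BatchRequest, PartitionDefinition, RuntimeDataConnector, RuntimeDataBatchSpec, CustomListSorter, and ge_exceptions without showing their imports. A minimal sketch of the imports they assume is given below; the module paths follow the Great Expectations 0.13.x layout and may differ in other versions.

# Sketch of the imports assumed by the test snippets above (module paths
# follow the Great Expectations 0.13.x layout and may vary between versions).
from typing import List

import pandas as pd
import pytest

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch import BatchDefinition, BatchRequest
from great_expectations.core.batch_spec import BatchSpec, RuntimeDataBatchSpec
from great_expectations.core.id_dict import PartitionDefinition
from great_expectations.datasource.data_connector import RuntimeDataConnector
from great_expectations.datasource.data_connector.sorter import CustomListSorter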
Example 5
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[
        Tuple[str, PartitionDefinitionSubset]
    ] = convert_data_reference_string_to_partition_definition_using_regex(
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    if processed_data_reference is None:
        return None
    data_asset_name_from_partition_definition: str = processed_data_reference[0]
    partition_definition: PartitionDefinitionSubset = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_partition_definition

    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
    ]
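
A hypothetical call to the helper above is sketched below; the data reference, regex pattern, and group names are illustrative only (they mirror the "alpha-(.*)\.csv" pattern used in a later example) and are not part of the original snippet.

# Hypothetical usage of map_data_reference_string_to_batch_definition_list_using_regex;
# the argument values are illustrative only.
batch_definition_list = map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name="my_datasource",
    data_connector_name="my_data_connector",
    data_reference="alpha-2020.csv",
    regex_pattern=r"alpha-(.*)\.csv",
    group_names=["index"],
    data_asset_name="alpha",
)
# When the reference matches the pattern, the result is a one-element list of
# BatchDefinition objects; when it does not, the helper returns None.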
Example 6
    def get_batch_definition_list_from_batch_request(
        self,
        batch_request: BatchRequest,
    ) -> List[BatchDefinition]:
        self._validate_batch_request(batch_request=batch_request)

        partition_identifiers: Optional[dict] = None
        if batch_request.partition_request:
            self._validate_partition_identifiers(
                partition_identifiers=batch_request.partition_request.get(
                    "partition_identifiers"))
            partition_identifiers = batch_request.partition_request.get(
                "partition_identifiers")
        if not partition_identifiers:
            partition_identifiers = {}

        batch_definition_list: List[BatchDefinition]

        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=DEFAULT_DATA_ASSET_NAME,
            partition_definition=PartitionDefinition(partition_identifiers),
        )

        if batch_definition_matches_batch_request(
                batch_definition=batch_definition,
                batch_request=batch_request):
            batch_definition_list = [batch_definition]
        else:
            batch_definition_list = []

        return batch_definition_list
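
A hypothetical invocation of the method above is sketched below; `connector` stands for an already-configured instance of the data connector class the method belongs to, and the identifier values are illustrative. As the code shows, the BatchDefinition is always built with DEFAULT_DATA_ASSET_NAME and is returned only if it matches the request.

# Hypothetical usage of get_batch_definition_list_from_batch_request; "connector"
# is an already-configured instance of the data connector class shown above.
batch_request = BatchRequest(
    datasource_name="my_datasource",
    data_connector_name="test_runtime_data_connector",
    data_asset_name="IN_MEMORY_DATA_ASSET",
    partition_request={"partition_identifiers": {"airflow_run_id": 1234567890}},
)
batch_definition_list = connector.get_batch_definition_list_from_batch_request(
    batch_request=batch_request
)
# The list holds the single BatchDefinition when it matches the request,
# and is empty otherwise.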
Example 7
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_asset_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
) -> Optional[List[BatchDefinition]]:
    batch_request: BatchRequest = (
        convert_data_reference_string_to_batch_request_using_regex(
            data_reference=data_reference,
            regex_pattern=regex_pattern,
            group_names=group_names,
        ))
    if batch_request is None:
        return None

    if data_asset_name is None:
        data_asset_name = batch_request.data_asset_name

    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(
                batch_request.partition_request),
        )
    ]

    def _get_batch_definition_list_from_batch_request(
        self,
        batch_request: BatchRequest,
    ) -> List[BatchDefinition]:
        """
        <Will> 202103. The following behavior of the _data_references_cache follows a pattern that we are using for
        other data_connectors, including variations of FilePathDataConnector. When BatchRequest contains batch_data
        that is passed in as a in-memory dataframe, the cache will contain the names of all data_assets
        (and data_references) that have been passed into the RuntimeDataConnector in this session, even though technically
        only the most recent batch_data is available. This can be misleading. However, allowing the RuntimeDataConnector
        to keep a record of all data_assets (and data_references) that have been passed in will allow for the proposed
        behavior of RuntimeBatchRequest which will allow for paths and queries to be passed in as part of the BatchRequest.
        Therefore this behavior will be revisited when the design of RuntimeBatchRequest and related classes are complete.
        """
        self._validate_batch_request(batch_request=batch_request)

        batch_identifiers = batch_request.partition_request.get("batch_identifiers")

        self._validate_batch_identifiers(batch_identifiers=batch_identifiers)

        batch_definition_list: List[BatchDefinition]
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            partition_definition=PartitionDefinition(batch_identifiers),
        )
        batch_definition_list = [batch_definition]
        self._update_data_references_cache(
            batch_request.data_asset_name, batch_definition_list, batch_identifiers
        )
        return batch_definition_list

def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
        test_s3_files, test_df_small):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        bucket=bucket,
        prefix="",
        assets={"alpha": {}},
    )
    batch_def = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_asset_name="alpha",
        data_connector_name="my_data_connector",
        partition_definition=PartitionDefinition(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = PandasExecutionEngine().get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(
            batch_definition=batch_def))
    assert test_df.dataframe.shape == expected_df.shape

    # if the key does not exist, a ClientError is raised
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_asset_name="alpha",
        data_connector_name="my_data_connector",
        partition_definition=PartitionDefinition(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ClientError):
        PandasExecutionEngine().get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key))

    def _map_data_reference_to_batch_definition_list(
        self, data_reference: str, data_asset_name: Optional[str] = None
    ) -> Optional[List[BatchDefinition]]:
        if data_asset_name is None:
            data_asset_name = DEFAULT_DATA_ASSET_NAME
        return [
            BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=data_asset_name,
                partition_definition=PartitionDefinition(),
            )
        ]

def test_get_batch_definition_list_from_batch_request_length_one(
    basic_datasource,
):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        }
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "IN_MEMORY_DATA_ASSET",
        "batch_data": test_df,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    expected_batch_definition_list: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="IN_MEMORY_DATA_ASSET",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        )
    ]

    batch_definition_list: List[
        BatchDefinition
    ] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    assert batch_definition_list == expected_batch_definition_list

def test_data_references_cache_updating_after_batch_request(
    basic_datasource,
):
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # empty if data_connector has not been used
    assert test_runtime_data_connector.get_available_data_asset_names() == []

    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        }
    }
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "batch_data": test_df,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    # run with my_data_asset_1
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request)

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 1234567890}),
                )
            ],
        }
    }

    # update the same data asset with a new batch
    test_df_new: pd.DataFrame = pd.DataFrame(data={
        "col1": [5, 6],
        "col2": [7, 8]
    })
    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 987654321,
        }
    }

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "batch_data": test_df_new,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    # run with the same data asset but a new batch
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request)

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 1234567890}),
                )
            ],
            "987654321": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 987654321}),
                )
            ],
        },
    }

    # new data_asset_name
    test_df_new_asset: pd.DataFrame = pd.DataFrame(data={
        "col1": [9, 10],
        "col2": [11, 12]
    })
    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 5555555,
        }
    }

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_2",
        "batch_data": test_df_new_asset,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    # run with a new data asset (my_data_asset_2)
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request)

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 1234567890}),
                )
            ],
            "987654321": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 987654321}),
                )
            ],
        },
        "my_data_asset_2": {
            "5555555": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_2",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 5555555}),
                )
            ]
        },
    }

    assert test_runtime_data_connector.get_available_data_asset_names() == [
        "my_data_asset_1",
        "my_data_asset_2",
    ]

    assert test_runtime_data_connector.get_data_reference_list_count() == 3