Example #1
0
def test__get_data_reference_name(basic_datasource):
    data_connector_query: dict = {
        "batch_filter_parameters": {
            "airflow_run_id": 1234567890,
        }
    }
    batch_identifiers = IDDict(data_connector_query["batch_filter_parameters"])

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])

    assert (test_runtime_data_connector._get_data_reference_name(
        batch_identifiers) == "1234567890")

    data_connector_query: dict = {
        "batch_filter_parameters": {
            "run_id_1": 1234567890,
            "run_id_2": 1111111111,
        }
    }
    batch_identifiers = IDDict(data_connector_query["batch_filter_parameters"])

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])

    assert (test_runtime_data_connector._get_data_reference_name(
        batch_identifiers) == "1234567890-1111111111")
Example #2
0
    def resolve_metric_bundle(
        self,
        metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Callable, dict, dict, dict]],
    ) -> dict:
        """For each metric name in the given metric_fn_bundle, finds the domain of the metric and calculates it using a
        metric function from the given provider class.

                Args:
                    metric_fn_bundle - A batch containing MetricEdgeKeys and their corresponding functions
                    metrics (dict) - A dictionary containing metrics and corresponding parameters

                Returns:
                    A dictionary of the collected metrics over their respective domains
        """

        resolved_metrics = dict()
        aggregates: Dict[str, dict] = dict()
        for (
                metric_to_resolve,
                engine_fn,
                compute_domain_kwargs,
                accessor_domain_kwargs,
                metric_provider_kwargs,
        ) in metric_fn_bundle:
            if not isinstance(compute_domain_kwargs, IDDict):
                compute_domain_kwargs = IDDict(compute_domain_kwargs)
            domain_id = compute_domain_kwargs.to_id()
            if domain_id not in aggregates:
                aggregates[domain_id] = {
                    "column_aggregates": [],
                    "ids": [],
                    "domain_kwargs": compute_domain_kwargs,
                }
            aggregates[domain_id]["column_aggregates"].append(engine_fn)
            aggregates[domain_id]["ids"].append(metric_to_resolve.id)
        for aggregate in aggregates.values():
            compute_domain_kwargs = aggregate["domain_kwargs"]
            df, _, _ = self.get_compute_domain(compute_domain_kwargs,
                                               domain_type="identity")
            assert len(aggregate["column_aggregates"]) == len(aggregate["ids"])
            # All aggregates that share this compute domain are evaluated in a single Spark job.
            aggregate_cols = aggregate["column_aggregates"]
            res = df.agg(*aggregate_cols).collect()
            assert (
                len(res) == 1
            ), "all bundle-computed metrics must be single-value statistics"
            assert len(aggregate["ids"]) == len(
                res[0]), "unexpected number of metrics returned"
            logger.debug(
                f"SparkDFExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(compute_domain_kwargs).to_id()}"
            )
            for idx, id in enumerate(aggregate["ids"]):
                resolved_metrics[id] = res[0][idx]

        return resolved_metrics
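The bundling above exists so that every metric sharing a compute domain is resolved in a single aggregation pass. A minimal, self-contained sketch of the same grouping idea using plain dictionaries (hypothetical metric tuples, no Spark session; a sorted-items tuple stands in for IDDict.to_id()):

from collections import defaultdict

# Hypothetical (metric_id, aggregate_expression, domain_kwargs) tuples.
bundle = [
    ("column.max", "max(a)", {"table": "t", "column": "a"}),
    ("column.min", "min(a)", {"table": "t", "column": "a"}),
    ("table.row_count", "count(*)", {"table": "t"}),
]

aggregates = defaultdict(lambda: {"fns": [], "ids": []})
for metric_id, fn, domain_kwargs in bundle:
    domain_id = tuple(sorted(domain_kwargs.items()))  # stand-in for IDDict.to_id()
    aggregates[domain_id]["fns"].append(fn)
    aggregates[domain_id]["ids"].append(metric_id)

# Two distinct domains -> two aggregation passes instead of three.
assert len(aggregates) == 2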
Example #3
0
def test_sorter_instantiation_custom_list_with_periodic_table(
    periodic_table_of_elements, ):
    # CustomListSorter
    sorter_params: dict = {
        "reference_list": periodic_table_of_elements,
    }
    my_custom_sorter = CustomListSorter(name="element",
                                        orderby="asc",
                                        **sorter_params)
    # noinspection PyProtectedMember
    assert my_custom_sorter._reference_list == periodic_table_of_elements
    # This element exists: Hydrogen
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    returned_partition_key = my_custom_sorter.get_batch_key(test_batch_def)
    assert returned_partition_key == 0

    # This element does not exist: Vibranium
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        batch_identifiers=IDDict({"element": "Vibranium"}),
    )
    with pytest.raises(ge_exceptions.SorterError):
        my_custom_sorter.get_batch_key(test_batch_def)
Example #4
0
 def _get_data_reference_name(
     batch_identifiers: IDDict,
 ) -> str:
     if batch_identifiers is None:
         batch_identifiers = IDDict({})
     data_reference_name = DEFAULT_DELIMITER.join(
         [str(value) for value in batch_identifiers.values()]
     )
     return data_reference_name
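For illustration, the same joining behavior can be reproduced standalone (assuming DEFAULT_DELIMITER is "-", as implied by the assertions in Example #1, and using a plain dict in place of IDDict):

DEFAULT_DELIMITER = "-"  # assumed value, matching the "1234567890-1111111111" assertion above

def join_identifiers(batch_identifiers: dict) -> str:
    # Mirrors _get_data_reference_name: join all identifier values with the delimiter.
    return DEFAULT_DELIMITER.join(str(value) for value in batch_identifiers.values())

assert join_identifiers({"airflow_run_id": 1234567890}) == "1234567890"
assert join_identifiers({"run_id_1": 1234567890, "run_id_2": 1111111111}) == "1234567890-1111111111"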
Example #5
0
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
        test_s3_files, test_df_small):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    batch_def = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(
            batch_definition=batch_def))
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key))
Example #6
0
def test__generate_batch_spec_parameters_from_batch_definition(
    basic_datasource,
):
    batch_identifiers = {
        "custom_key_0": "staging",
        "airflow_run_id": 1234567890,
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    expected_batch_spec_parameters: dict = {"data_asset_name": "my_data_asset"}

    # noinspection PyProtectedMember
    batch_spec_parameters: dict = test_runtime_data_connector._generate_batch_spec_parameters_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            batch_identifiers=IDDict(batch_identifiers),
        )
    )

    assert batch_spec_parameters == expected_batch_spec_parameters
Example #7
0
def test_asset_is_name_batch_identifier_correctly_used(
        basic_datasource_with_assets, test_df_pandas):
    """
    Use asset_a, which is named in the RuntimeDataConnector configuration, together with batch_identifiers that are also named in the configuration.
    """
    runtime_data_connector: RuntimeDataConnector = (
        basic_datasource_with_assets.data_connectors["runtime"])
    res: List[
        BatchDefinition] = runtime_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=basic_datasource_with_assets.name,
                data_connector_name="runtime",
                data_asset_name="asset_a",
                batch_identifiers={
                    "month": 4,
                    "day": 1
                },
                runtime_parameters={"batch_data": test_df_pandas},
            ))
    assert len(res) == 1
    assert res[0] == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="runtime",
        data_asset_name="asset_a",
        batch_identifiers=IDDict({
            "month": 4,
            "day": 1
        }),
    )
Example #8
0
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[Tuple[
        str,
        IDDict]] = convert_data_reference_string_to_batch_identifiers_using_regex(
            data_reference=data_reference,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )
    if processed_data_reference is None:
        return None
    data_asset_name_from_batch_identifiers: str = processed_data_reference[0]
    batch_identifiers: IDDict = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_batch_identifiers

    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
        )
    ]
Example #9
0
def batch_fixture() -> Batch:
    """
    Fixture for a Batch object that contains data, a BatchRequest, and a BatchDefinition,
    as well as a BatchSpec and BatchMarkers. To be used in unit testing.
    """
    df: pd.DataFrame = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]}
    )
    batch_request: BatchRequest = BatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
    )
    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
        batch_identifiers=IDDict({"id": "A"}),
    )
    batch_spec: BatchSpec = BatchSpec(path="/some/path/some.file")
    batch_markers: BatchMarkers = BatchMarkers(ge_load_time="FAKE_LOAD_TIME")
    batch: Batch = Batch(
        data=df,
        batch_request=batch_request,
        batch_definition=batch_definition,
        batch_spec=batch_spec,
        batch_markers=batch_markers,
    )
    return batch
Example #10
0
def ge_validator_sqlalchemy() -> Validator:
    validator = Validator(
        execution_engine=SqlAlchemyExecutionEngine(
            connection_string="postgresql://localhost:5432/test"),
        batches=[
            Batch(
                data=None,
                batch_request=BatchRequest(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                ),
                batch_definition=BatchDefinition(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                    batch_identifiers=IDDict(),
                ),
                batch_spec=SqlAlchemyDatasourceBatchSpec({
                    "data_asset_name": "foo2",
                    "table_name": "foo2",
                    "batch_identifiers": {},
                    "schema_name": "public",
                    "type": "table",
                }),
            )
        ],
    )
    return validator
Example #11
0
def convert_data_reference_string_to_batch_identifiers_using_regex(
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
) -> Optional[Tuple[str, IDDict]]:
    # noinspection PyUnresolvedReferences
    pattern = re.compile(regex_pattern)
    matches: Optional[re.Match] = pattern.match(data_reference)
    if matches is None:
        return None

    # Check for `(?P<name>)` named group syntax
    match_dict = matches.groupdict()
    if match_dict:  # Only named groups will populate this dict
        batch_identifiers = _determine_batch_identifiers_using_named_groups(
            match_dict, group_names)
    else:
        groups: list = list(matches.groups())
        batch_identifiers: IDDict = IDDict(dict(zip(group_names, groups)))

    # TODO: <Alex>Accommodating "data_asset_name" inside batch_identifiers (e.g., via "group_names") is problematic; we need a better mechanism.</Alex>
    # TODO: <Alex>Update: Approach -- we can differentiate "def map_data_reference_string_to_batch_definition_list_using_regex(()" methods between ConfiguredAssetFilesystemDataConnector and InferredAssetFilesystemDataConnector so that IDDict never needs to include data_asset_name. (ref: https://superconductivedata.slack.com/archives/C01C0BVPL5Q/p1603843413329400?thread_ts=1603842470.326800&cid=C01C0BVPL5Q)</Alex>
    data_asset_name: str = batch_identifiers.pop("data_asset_name",
                                                 DEFAULT_DATA_ASSET_NAME)
    return data_asset_name, batch_identifiers
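A self-contained sketch of the unnamed-group path, using only the standard library (plain dict in place of IDDict; the "DEFAULT_ASSET_NAME" fallback is a hypothetical value standing in for DEFAULT_DATA_ASSET_NAME):

import re
from typing import List, Optional, Tuple

def to_batch_identifiers(data_reference: str, regex_pattern: str, group_names: List[str]) -> Optional[Tuple[str, dict]]:
    matches = re.compile(regex_pattern).match(data_reference)
    if matches is None:
        return None
    batch_identifiers = dict(zip(group_names, matches.groups()))
    # A captured "data_asset_name" takes precedence over the default.
    data_asset_name = batch_identifiers.pop("data_asset_name", "DEFAULT_ASSET_NAME")
    return data_asset_name, batch_identifiers

assert to_batch_identifiers(
    "alpha-2020.csv", r"(.*)-(\d{4})\.csv", ["data_asset_name", "year"]
) == ("alpha", {"year": "2020"})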
Example #12
0
def get_domain_metrics_dict_by_name(metrics: Dict[Tuple, Any],
                                    metric_domain_kwargs: IDDict):
    return {
        metric_edge_key_id_tuple[0]: metric_value
        for metric_edge_key_id_tuple, metric_value in metrics.items()
        if metric_edge_key_id_tuple[1] == metric_domain_kwargs.to_id()
    }
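A hedged illustration of what this filter does, assuming (as the code above relies on) that metric keys are tuples of (metric_name, domain_kwargs_id, value_kwargs_id) and that the domain id is a string:

# Hypothetical resolved-metrics dict keyed by (metric_name, domain_id, value_id) tuples.
domain_id = "abc123"
metrics = {
    ("column.max", domain_id, "()"): 42,
    ("column.min", domain_id, "()"): 1,
    ("column.max", "other_domain", "()"): 99,
}

# Equivalent hand-rolled filtering: keep only metrics computed on the target domain, keyed by name.
metrics_by_name = {key[0]: value for key, value in metrics.items() if key[1] == domain_id}
assert metrics_by_name == {"column.max": 42, "column.min": 1}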
Example #13
0
def build_batch_filter(data_connector_query_dict: Optional[Dict[
    str, Optional[Union[int, list, tuple, slice, str, Union[Dict, IDDict],
                        Callable, ]], ]] = None):
    if not data_connector_query_dict:
        return BatchFilter(
            custom_filter_function=None,
            batch_filter_parameters=None,
            index=None,
            limit=None,
        )
    data_connector_query_keys: set = set(data_connector_query_dict.keys())
    if not data_connector_query_keys <= BatchFilter.RECOGNIZED_KEYS:
        raise ge_exceptions.BatchFilterError(
            f"""Unrecognized data_connector_query key(s):
"{str(data_connector_query_keys - BatchFilter.RECOGNIZED_KEYS)}" detected.
            """)
    custom_filter_function: Callable = data_connector_query_dict.get(
        "custom_filter_function")
    if custom_filter_function and not isinstance(custom_filter_function,
                                                 Callable):
        raise ge_exceptions.BatchFilterError(
            f"""The type of a custom_filter must be a function (Python "Callable").  The type given is
"{str(type(custom_filter_function))}", which is illegal.
            """)
    batch_filter_parameters: Optional[dict] = data_connector_query_dict.get(
        "batch_filter_parameters")
    if batch_filter_parameters:
        if not isinstance(batch_filter_parameters, dict):
            raise ge_exceptions.BatchFilterError(
                f"""The type of batch_filter_parameters must be a dictionary (Python "dict").  The type given is
"{str(type(batch_filter_parameters))}", which is illegal.
                """)
        if not all(
            [isinstance(key, str) for key in batch_filter_parameters.keys()]):
            raise ge_exceptions.BatchFilterError(
                'All batch_filter_parameters keys must be strings (Python "str").'
            )
    if batch_filter_parameters is not None:
        batch_filter_parameters: IDDict = IDDict(batch_filter_parameters)
    index: Optional[Union[int, list, tuple, slice,
                          str]] = data_connector_query_dict.get("index")
    limit: Optional[int] = data_connector_query_dict.get("limit")
    if limit and (not isinstance(limit, int) or limit < 0):
        raise ge_exceptions.BatchFilterError(
            f"""The type of a limit must be an integer (Python "int") that is greater than or equal to 0.  The
type and value given are "{str(type(limit))}" and "{limit}", respectively, which is illegal.
            """)
    if index is not None and limit is not None:
        raise ge_exceptions.BatchFilterError(
            "Only one of index or limit, but not both, can be specified (specifying both is illegal)."
        )
    index = _parse_index(index=index)
    return BatchFilter(
        custom_filter_function=custom_filter_function,
        batch_filter_parameters=batch_filter_parameters,
        index=index,
        limit=limit,
    )
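A hedged usage sketch of the validation rules above (assuming BatchFilter exposes a batch_filter_parameters property and that "index" and "limit" are among its recognized keys):

import pytest

# index and limit are mutually exclusive, so specifying both raises BatchFilterError.
with pytest.raises(ge_exceptions.BatchFilterError):
    build_batch_filter({"index": 0, "limit": 10})

# A well-formed query wraps batch_filter_parameters in an IDDict.
batch_filter = build_batch_filter(
    {"batch_filter_parameters": {"airflow_run_id": 1234567890}}
)
assert batch_filter.batch_filter_parameters == IDDict({"airflow_run_id": 1234567890})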
Example #14
0
def get_metric_kwargs(
    metric_name: str,
    configuration: Optional["ExpectationConfiguration"] = None,
    runtime_configuration: Optional[dict] = None,
) -> Dict:
    try:
        metric_definition = _registered_metrics.get(metric_name)
        if metric_definition is None:
            raise ge_exceptions.MetricProviderError(
                f"No definition found for {metric_name}")
        default_kwarg_values = metric_definition["default_kwarg_values"]
        metric_kwargs = {
            "metric_domain_keys": metric_definition["metric_domain_keys"],
            "metric_value_keys": metric_definition["metric_value_keys"],
        }
        if configuration:
            expectation_impl = get_expectation_impl(
                configuration.expectation_type)
            configuration_kwargs = expectation_impl().get_runtime_kwargs(
                configuration=configuration,
                runtime_configuration=runtime_configuration)
            if len(metric_kwargs["metric_domain_keys"]) > 0:
                metric_domain_kwargs = IDDict({
                    k: configuration_kwargs.get(k)
                    or default_kwarg_values.get(k)
                    for k in metric_kwargs["metric_domain_keys"]
                })
            else:
                metric_domain_kwargs = IDDict()
            if len(metric_kwargs["metric_value_keys"]) > 0:
                metric_value_kwargs = IDDict({
                    k:
                    configuration_kwargs.get(k) if configuration_kwargs.get(k)
                    is not None else default_kwarg_values.get(k)
                    for k in metric_kwargs["metric_value_keys"]
                })
            else:
                metric_value_kwargs = IDDict()
            metric_kwargs["metric_domain_kwargs"] = metric_domain_kwargs
            metric_kwargs["metric_value_kwargs"] = metric_value_kwargs
        return metric_kwargs
    except KeyError:
        raise ge_exceptions.MetricProviderError(
            f"Incomplete definition found for {metric_name}")
Example #15
0
    def _get_batch_definition_list_from_batch_request(
        self,
        batch_request: RuntimeBatchRequest,
    ) -> List[BatchDefinition]:
        """
        <Will> 202103. The following behavior of the _data_references_cache follows a pattern that we are using for
        other data_connectors, including variations of FilePathDataConnector. When BatchRequest contains batch_data
        that is passed in as a in-memory dataframe, the cache will contain the names of all data_assets
        (and data_references) that have been passed into the RuntimeDataConnector in this session, even though technically
        only the most recent batch_data is available. This can be misleading. However, allowing the RuntimeDataConnector
        to keep a record of all data_assets (and data_references) that have been passed in will allow for the proposed
        behavior of RuntimeBatchRequest which will allow for paths and queries to be passed in as part of the BatchRequest.
        Therefore this behavior will be revisited when the design of RuntimeBatchRequest and related classes are complete.
        """
        self._validate_batch_request(batch_request=batch_request)

        batch_identifiers: Optional[dict] = None
        if batch_request.batch_identifiers:
            self._validate_batch_identifiers(
                data_asset_name=batch_request.data_asset_name,
                batch_identifiers=batch_request.batch_identifiers,
            )
            batch_identifiers = batch_request.batch_identifiers

        if not batch_identifiers:
            raise ge_exceptions.DataConnectorError(
                "Passed in a RuntimeBatchRequest with no batch_identifiers")

        batch_definition_list: List[BatchDefinition]
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
            batch_spec_passthrough=batch_request.batch_spec_passthrough,
        )
        batch_definition_list = [batch_definition]
        self._update_data_references_cache(
            batch_request.data_asset_name,
            batch_definition_list,
            IDDict(batch_identifiers),
        )
        return batch_definition_list
Example #16
0
 def __init__(
     self,
     metric_name: str,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: dict = None,
     metric_dependencies: dict = None,
 ):
     self._metric_name = metric_name
     if not isinstance(metric_domain_kwargs, IDDict):
         metric_domain_kwargs = IDDict(metric_domain_kwargs)
     self._metric_domain_kwargs = metric_domain_kwargs
     if not isinstance(metric_value_kwargs, IDDict):
         if metric_value_kwargs is None:
             metric_value_kwargs = dict()
         metric_value_kwargs = IDDict(metric_value_kwargs)
     self._metric_value_kwargs = metric_value_kwargs
     if metric_dependencies is None:
         metric_dependencies = dict()
     self.metric_dependencies = metric_dependencies
Example #17
0
def _determine_batch_identifiers_using_named_groups(
        match_dict: dict, group_names: List[str]) -> IDDict:
    batch_identifiers = IDDict()
    for key, value in match_dict.items():
        if key in group_names:
            batch_identifiers[key] = value
        else:
            logger.warning(
                f"The named group '{key}' must explicitly be stated in group_names to be parsed"
            )
    return batch_identifiers
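A small standalone illustration of the named-group path with the standard re module (plain dict in place of IDDict; the file name and pattern are invented for the example):

import re

pattern = re.compile(r"(?P<name>.+)_(?P<timestamp>\d{8})\.csv")
match_dict = pattern.match("yellow_tripdata_20200101.csv").groupdict()

group_names = ["name", "timestamp"]
# Only named groups that are also listed in group_names become batch identifiers.
batch_identifiers = {key: value for key, value in match_dict.items() if key in group_names}
assert batch_identifiers == {"name": "yellow_tripdata", "timestamp": "20200101"}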
Example #18
0
def test__build_batch_spec(basic_datasource):
    batch_identifiers = {
        "custom_key_0": "staging",
        "airflow_run_id": 1234567890,
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    batch_definition = BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="test_runtime_data_connector",
        data_asset_name="my_data_asset",
        batch_identifiers=IDDict(batch_identifiers),
    )

    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=batch_definition,
        runtime_parameters={
            "batch_data": pd.DataFrame({"x": range(10)}),
        },
    )
    assert type(batch_spec) == RuntimeDataBatchSpec
    assert set(batch_spec.keys()) == {"batch_data", "data_asset_name"}
    assert batch_spec["batch_data"].shape == (10, 1)

    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=batch_definition,
        runtime_parameters={
            "query": "my_query",
        },
    )
    assert type(batch_spec) == RuntimeQueryBatchSpec

    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=batch_definition, runtime_parameters={"path": "my_path"}
    )
    assert type(batch_spec) == PathBatchSpec

    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=batch_definition,
        runtime_parameters={"path": "s3://my.s3.path"},
    )
    assert type(batch_spec) == S3BatchSpec

    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=batch_definition,
        runtime_parameters={"path": "s3a://my.s3.path"},
    )
    assert type(batch_spec) == S3BatchSpec
Example #19
0
 def from_object(cls, validation_result):
     batch_kwargs = validation_result.meta.get("batch_kwargs", {})
     if isinstance(batch_kwargs, IDDict):
         batch_identifier = batch_kwargs.to_id()
     elif isinstance(batch_kwargs, dict):
         batch_identifier = IDDict(batch_kwargs).to_id()
     else:
         raise DataContextError(
             "Unable to construct ValidationResultIdentifier from provided object."
         )
     return cls(
         expectation_suite_identifier=ExpectationSuiteIdentifier(
             validation_result.meta["expectation_suite_name"]),
         run_id=validation_result.meta.get("run_id"),
         batch_identifier=batch_identifier,
     )
Example #20
0
    def __init__(
        self,
        data,
        batch_request: BatchRequest = None,
        batch_definition: BatchDefinition = None,
        batch_spec: BatchSpec = None,
        batch_markers: BatchMarkers = None,
        # The remaining parameters are for backward compatibility.
        data_context=None,
        datasource_name=None,
        batch_parameters=None,
        batch_kwargs=None,
    ):
        self._data = data
        if batch_request is None:
            batch_request = dict()
        self._batch_request = batch_request
        if batch_definition is None:
            batch_definition = IDDict()
        self._batch_definition = batch_definition
        if batch_spec is None:
            batch_spec = BatchSpec()
        self._batch_spec = batch_spec

        if batch_markers is None:
            batch_markers = BatchMarkers(
                {
                    "ge_load_time": datetime.datetime.now(
                        datetime.timezone.utc
                    ).strftime("%Y%m%dT%H%M%S.%fZ")
                }
            )
        self._batch_markers = batch_markers

        # The remaining parameters are for backward compatibility.
        self._data_context = data_context
        self._datasource_name = datasource_name
        self._batch_parameters = batch_parameters
        self._batch_kwargs = batch_kwargs or BatchKwargs()
Example #21
0
def test_get_batch_definition_list_from_batch_request_length_one(
    basic_datasource,
):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    batch_identifiers: dict = {
        "airflow_run_id": 1234567890,
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {"batch_data": test_df},
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    expected_batch_definition_list: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            batch_identifiers=IDDict(batch_identifiers),
        )
    ]

    batch_definition_list: List[
        BatchDefinition
    ] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    assert batch_definition_list == expected_batch_definition_list
Example #22
0
 def __init__(self, metric_name, metric_kwargs, metric_value):
     self._metric_name = metric_name
     if not isinstance(metric_kwargs, IDDict):
         metric_kwargs = IDDict(metric_kwargs)
     self._metric_kwargs = metric_kwargs
     self._metric_value = metric_value
Example #23
0
def get_batch_request_from_acceptable_arguments(
    datasource_name: Optional[str] = None,
    data_connector_name: Optional[str] = None,
    data_asset_name: Optional[str] = None,
    *,
    batch_request: Optional[BatchRequestBase] = None,
    batch_data: Optional[Any] = None,
    data_connector_query: Optional[dict] = None,
    batch_identifiers: Optional[dict] = None,
    limit: Optional[int] = None,
    index: Optional[Union[int, list, tuple, slice, str]] = None,
    custom_filter_function: Optional[Callable] = None,
    batch_spec_passthrough: Optional[dict] = None,
    sampling_method: Optional[str] = None,
    sampling_kwargs: Optional[dict] = None,
    splitter_method: Optional[str] = None,
    splitter_kwargs: Optional[dict] = None,
    runtime_parameters: Optional[dict] = None,
    query: Optional[str] = None,
    path: Optional[str] = None,
    batch_filter_parameters: Optional[dict] = None,
    **kwargs,
) -> Union[BatchRequest, RuntimeBatchRequest]:
    """Obtain formal BatchRequest typed object from allowed attributes (supplied as arguments).
    This method applies only to the new (V3) Datasource schema.

    Args:
        datasource_name
        data_connector_name
        data_asset_name

        batch_request
        batch_data
        query
        path
        runtime_parameters
        data_connector_query
        batch_identifiers
        batch_filter_parameters

        limit
        index
        custom_filter_function

        sampling_method
        sampling_kwargs

        splitter_method
        splitter_kwargs

        batch_spec_passthrough

        **kwargs

    Returns:
        (BatchRequest or RuntimeBatchRequest) The formal BatchRequest or RuntimeBatchRequest object
    """

    if batch_request:
        if not isinstance(batch_request, (BatchRequest, RuntimeBatchRequest)):
            raise TypeError(
                f"""batch_request must be an instance of BatchRequest or RuntimeBatchRequest object, not \
{type(batch_request)}""")
        datasource_name = batch_request.datasource_name

    # ensure that the first parameter is datasource_name, which should be a str. This check prevents users
    # from passing in batch_request as an unnamed parameter.
    if not isinstance(datasource_name, str):
        raise ge_exceptions.GreatExpectationsTypeError(
            f"the first parameter, datasource_name, must be a str, not {type(datasource_name)}"
        )

    if len([arg for arg in [batch_data, query, path] if arg is not None]) > 1:
        raise ValueError(
            "Must provide only one of batch_data, query, or path.")

    if any([
            batch_data is not None and runtime_parameters
            and "batch_data" in runtime_parameters,
            query and runtime_parameters and "query" in runtime_parameters,
            path and runtime_parameters and "path" in runtime_parameters,
    ]):
        raise ValueError(
            "If batch_data, query, or path arguments are provided, the same keys cannot appear in the "
            "runtime_parameters argument.")

    if batch_request:
        # TODO: Raise a warning if any parameters besides batch_requests are specified
        return batch_request

    batch_request_class: type
    batch_request_as_dict: dict

    if any([batch_data is not None, query, path, runtime_parameters]):
        batch_request_class = RuntimeBatchRequest

        runtime_parameters = runtime_parameters or {}
        if batch_data is not None:
            runtime_parameters["batch_data"] = batch_data
        elif query is not None:
            runtime_parameters["query"] = query
        elif path is not None:
            runtime_parameters["path"] = path

        if batch_identifiers is None:
            batch_identifiers = kwargs
        else:
            # Raise a warning if kwargs exist
            pass

        batch_request_as_dict = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": data_asset_name,
            "runtime_parameters": runtime_parameters,
            "batch_identifiers": batch_identifiers,
            "batch_spec_passthrough": batch_spec_passthrough,
        }
    else:
        batch_request_class = BatchRequest

        if data_connector_query is None:
            if batch_filter_parameters is not None and batch_identifiers is not None:
                raise ValueError(
                    'Must provide either "batch_filter_parameters" or "batch_identifiers", not both.'
                )

            if batch_filter_parameters is None and batch_identifiers is not None:
                logger.warning(
                    'Attempting to build data_connector_query but "batch_identifiers" was provided '
                    'instead of "batch_filter_parameters". The "batch_identifiers" key on '
                    'data_connector_query has been renamed to "batch_filter_parameters". Please update '
                    'your code. Falling back on provided "batch_identifiers".')
                batch_filter_parameters = batch_identifiers
            elif batch_filter_parameters is None and batch_identifiers is None:
                batch_filter_parameters = kwargs
            else:
                # Raise a warning if kwargs exist
                pass

            data_connector_query_params: dict = {
                "batch_filter_parameters": batch_filter_parameters,
                "limit": limit,
                "index": index,
                "custom_filter_function": custom_filter_function,
            }
            data_connector_query = IDDict(data_connector_query_params)
        else:
            # Raise a warning if batch_filter_parameters or kwargs exist
            data_connector_query = IDDict(data_connector_query)

        if batch_spec_passthrough is None:
            batch_spec_passthrough = {}
            if sampling_method is not None:
                sampling_params: dict = {
                    "sampling_method": sampling_method,
                }
                if sampling_kwargs is not None:
                    sampling_params["sampling_kwargs"] = sampling_kwargs
                batch_spec_passthrough.update(sampling_params)
            if splitter_method is not None:
                splitter_params: dict = {
                    "splitter_method": splitter_method,
                }
                if splitter_kwargs is not None:
                    splitter_params["splitter_kwargs"] = splitter_kwargs
                batch_spec_passthrough.update(splitter_params)

        batch_request_as_dict: dict = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": data_asset_name,
            "data_connector_query": data_connector_query,
            "batch_spec_passthrough": batch_spec_passthrough,
        }

    deep_filter_properties_iterable(
        properties=batch_request_as_dict,
        inplace=True,
    )

    batch_request = batch_request_class(**batch_request_as_dict)

    return batch_request
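A hedged usage sketch of the routing logic above (illustrative values only; names as used elsewhere in these examples):

import pandas as pd

# Supplying batch_data routes to a RuntimeBatchRequest with runtime_parameters populated.
runtime_request = get_batch_request_from_acceptable_arguments(
    datasource_name="my_datasource",
    data_connector_name="test_runtime_data_connector",
    data_asset_name="my_data_asset",
    batch_data=pd.DataFrame({"x": range(10)}),
    batch_identifiers={"airflow_run_id": 1234567890},
)
assert isinstance(runtime_request, RuntimeBatchRequest)

# Without runtime arguments, a plain BatchRequest with a data_connector_query is built.
request = get_batch_request_from_acceptable_arguments(
    datasource_name="my_datasource",
    data_connector_name="my_data_connector",
    data_asset_name="my_data_asset",
    batch_filter_parameters={"airflow_run_id": 1234567890},
)
assert isinstance(request, BatchRequest)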
Example #24
0
 def id(self) -> str:
     return IDDict(self.to_json_dict()).to_id()
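IDDict.to_id() gives the dictionary a deterministic identifier. A minimal sketch of the underlying idea, not the library's actual implementation: hash a key-sorted serialization so that logically equal dictionaries always produce the same id.

import hashlib
import json

def to_id(d: dict) -> str:
    # Sorting keys makes the serialization, and therefore the hash, order-independent.
    serialized = json.dumps(d, sort_keys=True)
    return hashlib.md5(serialized.encode("utf-8")).hexdigest()

assert to_id({"a": 1, "b": 2}) == to_id({"b": 2, "a": 1})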
Example #25
0
def test_data_references_cache_updating_after_batch_request_named_assets(
    basic_datasource_with_assets, ):
    runtime_data_connector: RuntimeDataConnector = (
        basic_datasource_with_assets.data_connectors["runtime"])

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # the data_connector lists the assets named in its configuration
    assert runtime_data_connector.get_available_data_asset_names() == [
        "asset_a",
        "asset_b",
    ]

    batch_identifiers: dict = {"day": 1, "month": 1}
    batch_request: dict = {
        "datasource_name": basic_datasource_with_assets.name,
        "data_connector_name": runtime_data_connector.name,
        "data_asset_name": "asset_a",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }

    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with asset_a
    batch_definitions: List[
        BatchDefinition] = runtime_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=batch_request)
    assert batch_definitions == [
        BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="runtime",
            data_asset_name="asset_a",
            batch_identifiers=IDDict({
                "month": 1,
                "day": 1
            }),
        )
    ]
    assert runtime_data_connector._data_references_cache == {
        "asset_a": {
            "1-1": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="runtime",
                    data_asset_name="asset_a",
                    batch_identifiers=IDDict({
                        "day": 1,
                        "month": 1
                    }),
                )
            ],
        }
    }
    batch_identifiers: dict = {"day": 1, "month": 2}
    batch_request: dict = {
        "datasource_name": basic_datasource_with_assets.name,
        "data_connector_name": runtime_data_connector.name,
        "data_asset_name": "asset_a",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with another batch under asset_a
    batch_definitions: List[
        BatchDefinition] = runtime_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=batch_request)
    assert batch_definitions == [
        BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="runtime",
            data_asset_name="asset_a",
            batch_identifiers=IDDict({
                "month": 2,
                "day": 1
            }),
        ),
    ]
    assert runtime_data_connector._data_references_cache == {
        "asset_a": {
            "1-1": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="runtime",
                    data_asset_name="asset_a",
                    batch_identifiers=IDDict({
                        "day": 1,
                        "month": 1
                    }),
                )
            ],
            "1-2": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="runtime",
                    data_asset_name="asset_a",
                    batch_identifiers=IDDict({
                        "day": 1,
                        "month": 2
                    }),
                )
            ],
        }
    }
Example #26
0
def test_data_references_cache_updating_after_batch_request(
    basic_datasource, ):
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # empty if data_connector has not been used
    assert test_runtime_data_connector.get_available_data_asset_names() == []

    batch_identifiers = {
        "airflow_run_id": 1234567890,
    }

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with my_data_asset_1
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request)

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    batch_identifiers=IDDict({"airflow_run_id": 1234567890}),
                )
            ],
        }
    }

    # update with a new batch for the same data asset
    test_df_new: pd.DataFrame = pd.DataFrame(data={
        "col1": [5, 6],
        "col2": [7, 8]
    })
    batch_identifiers = {
        "airflow_run_id": 987654321,
    }

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "runtime_parameters": {
            "batch_data": test_df_new,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with the same data_asset but a new batch
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request)

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    batch_identifiers=IDDict({"airflow_run_id": 1234567890}),
                )
            ],
            "987654321": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    batch_identifiers=IDDict({"airflow_run_id": 987654321}),
                )
            ],
        },
    }

    # new data_asset_name
    test_df_new_asset: pd.DataFrame = pd.DataFrame(data={
        "col1": [9, 10],
        "col2": [11, 12]
    })
    batch_identifiers = {
        "airflow_run_id": 5555555,
    }

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_2",
        "runtime_parameters": {
            "batch_data": test_df_new_asset,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with a new data_asset
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request)

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    batch_identifiers=IDDict({"airflow_run_id": 1234567890}),
                )
            ],
            "987654321": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    batch_identifiers=IDDict({"airflow_run_id": 987654321}),
                )
            ],
        },
        "my_data_asset_2": {
            "5555555": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_2",
                    batch_identifiers=IDDict({"airflow_run_id": 5555555}),
                )
            ]
        },
    }

    assert test_runtime_data_connector.get_available_data_asset_names() == [
        "my_data_asset_1",
        "my_data_asset_2",
    ]

    assert test_runtime_data_connector.get_data_reference_list_count() == 3