Example #1
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    batch_def: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )
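
The failure branch above falls out of the asset's regex: every S3 key matching alpha-(.*)\.csv yields an `index` identifier, and no key corresponds to index 9. A minimal sketch of that extraction step, using plain `re` and hypothetical keys rather than GE internals:

import re

# Hypothetical keys mirroring the fixture's alpha-*.csv layout.
keys = ["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"]
pattern = re.compile(r"alpha-(.*)\.csv")

# Name the captured groups per the connector's `group_names`.
identifiers = [
    dict(zip(["index"], m.groups())) for key in keys if (m := pattern.match(key))
]
print(identifiers)  # [{'index': '1'}, {'index': '2'}, {'index': '3'}]
# No key yields an index of 9, which is why the second lookup raises
# ExecutionEngineError.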
Example #2
    @classmethod
    def from_object(cls, validation_result):
        batch_kwargs = validation_result.meta.get("batch_kwargs", {})
        if isinstance(batch_kwargs, IDDict):
            batch_identifier = batch_kwargs.to_id()
        elif isinstance(batch_kwargs, dict):
            batch_identifier = IDDict(batch_kwargs).to_id()
        else:
            raise DataContextError(
                "Unable to construct ValidationResultIdentifier from provided object."
            )
        return cls(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                validation_result.meta["expectation_suite_name"]
            ),
            run_id=validation_result.meta.get("run_id"),
            batch_identifier=batch_identifier,
        )
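
`from_object` relies on `IDDict.to_id()` to turn arbitrary batch_kwargs into a stable fingerprint. A rough sketch of the idea, assuming the id is a digest of a sorted-key JSON dump (the exact algorithm inside `IDDict` may differ):

import hashlib
import json

def to_id_sketch(d: dict) -> str:
    # Sorting keys makes the digest order-independent, so logically equal
    # batch_kwargs always map to the same identifier.
    return hashlib.md5(json.dumps(d, sort_keys=True).encode("utf-8")).hexdigest()

assert to_id_sketch({"path": "a.csv", "reader": "csv"}) == to_id_sketch(
    {"reader": "csv", "path": "a.csv"}
)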
Example #3
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph,
                MetricConfiguration("column_values.not_a_metric", IDDict()),
                configuration,
                execution_engine=engine,
            )
        except MetricProviderError as e:
            # The unknown metric name surfaces as a MetricProviderError; capture
            # it so the assertion below can verify the failure.
            graph = e

    assert isinstance(graph, MetricProviderError)
Example #4
    @property
    def id(self) -> str:
        return IDDict(self.to_json_dict()).to_id()
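
Because the id is derived from the object's JSON representation, two instances with identical contents share an id, which is what allows BatchDefinition lists to be compared with `set()` in the tests below. A hypothetical illustration (the `Thing` class is invented for this sketch):

from great_expectations.core.id_dict import IDDict

# Hypothetical value object reusing the pattern from the property above.
class Thing:
    def __init__(self, **kwargs):
        self._kwargs = kwargs

    def to_json_dict(self) -> dict:
        return dict(self._kwargs)

    @property
    def id(self) -> str:
        return IDDict(self.to_json_dict()).to_id()

# Content-equal objects share an id regardless of construction order.
assert Thing(a=1, b=2).id == Thing(b=2, a=1).id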
Example #5
    def resolve_metric_bundle(
        self,
        metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Any, dict, dict, dict]],
    ) -> Dict[Tuple[str, str, str], Any]:
        """For every metric in a set of Metrics to resolve, obtains necessary metric keyword arguments and builds
        bundles of the metrics into one large query dictionary so that they are all executed simultaneously. Will fail
        if bundling the metrics together is not possible.

            Args:
                metric_fn_bundle (Iterable[Tuple[MetricConfiguration, Callable, dict]): \
                    A Dictionary containing a MetricProvider's MetricConfiguration (its unique identifier), its metric provider function
                    (the function that actually executes the metric), and the arguments to pass to the metric provider function.
                    A dictionary of metrics defined in the registry and corresponding arguments

            Returns:
                A dictionary of metric names and their corresponding now-queried values.
        """
        resolved_metrics = {}

        # We need a different query for each domain (where clause).
        queries: Dict[Tuple, dict] = {}
        for (
            metric_to_resolve,
            engine_fn,
            compute_domain_kwargs,
            accessor_domain_kwargs,
            metric_provider_kwargs,
        ) in metric_fn_bundle:
            if not isinstance(compute_domain_kwargs, IDDict):
                compute_domain_kwargs = IDDict(compute_domain_kwargs)
            domain_id = compute_domain_kwargs.to_id()
            if domain_id not in queries:
                queries[domain_id] = {
                    "select": [],
                    "ids": [],
                    "domain_kwargs": compute_domain_kwargs,
                }
            if self.engine.dialect.name == "clickhouse":
                # ClickHouse rejects duplicate column labels within one SELECT, so
                # pad the metric name with random characters to keep labels unique.
                queries[domain_id]["select"].append(
                    engine_fn.label(
                        metric_to_resolve.metric_name.join(
                            random.choices(string.ascii_lowercase, k=2)
                        )
                    )
                )
            else:
                queries[domain_id]["select"].append(
                    engine_fn.label(metric_to_resolve.metric_name)
                )
            queries[domain_id]["ids"].append(metric_to_resolve.id)
        for query in queries.values():
            domain_kwargs = query["domain_kwargs"]
            selectable = self.get_domain_records(
                domain_kwargs=domain_kwargs,
            )
            assert len(query["select"]) == len(query["ids"])
            try:
                """
                If a custom query is passed, selectable will be TextClause and not formatted
                as a subquery wrapped in "(subquery) alias". TextClause must first be converted
                to TextualSelect using sa.columns() before it can be converted to type Subquery
                """
                if TextClause and isinstance(selectable, TextClause):
                    res = self.engine.execute(
                        sa.select(query["select"]).select_from(
                            selectable.columns().subquery()
                        )
                    ).fetchall()
                else:
                    res = self.engine.execute(
                        sa.select(query["select"]).select_from(selectable)
                    ).fetchall()
                logger.debug(
                    f"SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(domain_kwargs).to_id()}"
                )
            except OperationalError as oe:
                exception_message: str = "An SQL execution Exception occurred.  "
                exception_traceback: str = traceback.format_exc()
                exception_message += f'{type(oe).__name__}: "{str(oe)}".  Traceback: "{exception_traceback}".'
                logger.error(exception_message)
                raise ExecutionEngineError(message=exception_message)
            assert (
                len(res) == 1
            ), "all bundle-computed metrics must be single-value statistics"
            assert len(query["ids"]) == len(
                res[0]
            ), "unexpected number of metrics returned"
            for idx, id in enumerate(query["ids"]):
                resolved_metrics[id] = convert_to_json_serializable(res[0][idx])

        return resolved_metrics
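
Stripped of SQLAlchemy, the bundling above reduces to grouping select expressions by domain id so that metrics sharing a domain run in one query. A toy sketch of that grouping step, with plain tuples standing in for the MetricConfiguration bundle:

from collections import defaultdict

# Simplified stand-ins for the (MetricConfiguration, engine_fn, ...) tuples:
# (metric_id, domain_kwargs, select_expression).
bundle = [
    ("column.max#a", {"table": "t", "filter": "x > 0"}, "MAX(a)"),
    ("column.min#a", {"table": "t", "filter": "x > 0"}, "MIN(a)"),
    ("column.max#b", {"table": "t", "filter": ""}, "MAX(b)"),
]

queries = defaultdict(lambda: {"select": [], "ids": []})
for metric_id, domain_kwargs, expr in bundle:
    # Hashing the sorted items plays the role of IDDict.to_id().
    domain_id = tuple(sorted(domain_kwargs.items()))
    queries[domain_id]["select"].append(expr)
    queries[domain_id]["ids"].append(metric_id)

# Metrics sharing a domain run in a single SELECT: two queries total.
assert len(queries) == 2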
Example #6
    def resolve_metric_bundle(
        self,
        metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Any, dict, dict, dict]],
    ) -> dict:
        """For every metric in a set of metrics to resolve, obtains the necessary metric keyword arguments and
        bundles the metrics into one large query dictionary so that they are all executed simultaneously. Will
        fail if bundling the metrics together is not possible.

            Args:
                metric_fn_bundle (Iterable[Tuple[MetricConfiguration, Any, dict, dict, dict]]): \
                    An iterable of tuples, each containing a MetricProvider's MetricConfiguration (its unique
                    identifier), its metric provider function (the function that actually executes the metric),
                    and the keyword arguments to pass to the metric provider function.

            Returns:
                A dictionary of metric ids and their corresponding now-queried values.
        """
        resolved_metrics = dict()

        # We need a different query for each domain (where clause).
        queries: Dict[Tuple, dict] = dict()
        for (
            metric_to_resolve,
            engine_fn,
            compute_domain_kwargs,
            accessor_domain_kwargs,
            metric_provider_kwargs,
        ) in metric_fn_bundle:
            if not isinstance(compute_domain_kwargs, IDDict):
                compute_domain_kwargs = IDDict(compute_domain_kwargs)
            domain_id = compute_domain_kwargs.to_id()
            if domain_id not in queries:
                queries[domain_id] = {
                    "select": [],
                    "ids": [],
                    "domain_kwargs": compute_domain_kwargs,
                }
            queries[domain_id]["select"].append(
                engine_fn.label(metric_to_resolve.metric_name)
            )
            queries[domain_id]["ids"].append(metric_to_resolve.id)
        for query in queries.values():
            selectable, compute_domain_kwargs, _ = self.get_compute_domain(
                query["domain_kwargs"], domain_type="identity"
            )
            assert len(query["select"]) == len(query["ids"])
            res = self.engine.execute(
                sa.select(query["select"]).select_from(selectable)
            ).fetchall()
            logger.debug(
                f"SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(compute_domain_kwargs).to_id()}"
            )
            assert (
                len(res) == 1
            ), "all bundle-computed metrics must be single-value statistics"
            assert len(query["ids"]) == len(
                res[0]
            ), "unexpected number of metrics returned"
            for idx, id in enumerate(query["ids"]):
                resolved_metrics[id] = convert_to_json_serializable(res[0][idx])

        # Values were already made JSON-serializable via convert_to_json_serializable above.
        return resolved_metrics
Example #7
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
        )
        batch_definition_list: List[BatchDefinition] = (
            data_connector.get_batch_definition_list_from_batch_request(batch_request)
        )

        assert len(batch_definition_list) == test_case.num_expected_batch_definitions

        expected_batch_definition_list: List[BatchDefinition] = [
            BatchDefinition(
                datasource_name=datasource_name,
                data_connector_name=data_connector_name,
                data_asset_name=data_asset_name,
                batch_identifiers=IDDict({column_name: pickup_datetime}),
            ) for pickup_datetime in test_case.expected_pickup_datetimes
        ]

        assert set(batch_definition_list) == set(
            expected_batch_definition_list
        ), f"BatchDefinition lists don't match\n\nbatch_definition_list:\n{batch_definition_list}\n\nexpected_batch_definition_list:\n{expected_batch_definition_list}"

        # 4. Check that loaded data is as expected

        # Use expected_batch_definition_list since it is sorted, and we already
        # asserted that it contains the same items as batch_definition_list
        batch_spec: SqlAlchemyDatasourceBatchSpec = data_connector.build_batch_spec(
            expected_batch_definition_list[0])

        batch_data: SqlAlchemyBatchData = context.datasources[
Example #8
def test_return_all_batch_definitions_returns_specified_partition(
        mock_gcs_conn, mock_list_keys, mock_emit,
        empty_data_context_stats_enabled):
    my_data_connector_yaml = yaml.load(
        f"""
       class_name: ConfiguredAssetGCSDataConnector
       datasource_name: test_environment
       bucket_or_name: my_bucket
       prefix: ""
       assets:
           TestFiles:
       default_regex:
           pattern: (.+)_(.+)_(.+)\\.csv
           group_names:
               - name
               - timestamp
               - price
       sorters:
           - orderby: asc
             class_name: LexicographicSorter
             name: name
           - datetime_format: "%Y%m%d"
             orderby: desc
             class_name: DateTimeSorter
             name: timestamp
           - orderby: desc
             class_name: NumericSorter
             name: price
     """, )

    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    my_data_connector: ConfiguredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_gcs_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetGCSDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"][
        "batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_gcs_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=IDDict(
            **{
                "batch_filter_parameters": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }),
    )

    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))

    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_gcs_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(**{
            "name": "james",
            "timestamp": "20200713",
            "price": "1567",
        }),
    )
    assert my_batch_definition == expected_batch_definition
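
The `batch_filter_parameters` lookup amounts to matching the regex-derived identifiers against the requested ones. A small sketch of that selection with plain `re` and a subset of the mocked keys, outside any GE machinery:

import re

pattern = re.compile(r"(.+)_(.+)_(.+)\.csv")
group_names = ["name", "timestamp", "price"]
requested = {"name": "james", "timestamp": "20200713", "price": "1567"}

matches = []
for key in ["alex_20200809_1000.csv", "james_20200713_1567.csv", "james_20200810_1003.csv"]:
    m = pattern.match(key)
    if m is None:
        continue
    # Zip the capture groups with `group_names`, then keep only exact matches.
    identifiers = dict(zip(group_names, m.groups()))
    if all(identifiers[k] == v for k, v in requested.items()):
        matches.append(identifiers)

assert matches == [{"name": "james", "timestamp": "20200713", "price": "1567"}]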
Example #9
def expected_batch_definitions_unsorted():
    """
    Used to validate `get_batch_definition_list_from_batch_request()` outputs.
    Input and output should maintain the same order (hence "unsorted").
    """
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200811",
                "price": "1009"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200713",
                "price": "1567"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200810",
                "price": "1001"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200810",
                "price": "1003"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200819",
                "price": "1300"
            }),
        ),
    ]
    return expected
Example #10
def expected_batch_definitions_sorted():
    """
    Used to validate `get_batch_definition_list_from_batch_request()` outputs.
    Input is sorted by the configured sorters (name ascending, then timestamp
    and price descending), so the output order differs from the input order.
    """
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200819",
                "price": "1300"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200811",
                "price": "1009"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200810",
                "price": "1003"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200713",
                "price": "1567"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200810",
                "price": "1001"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
    ]
    return expected
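
For reference, the sorted order above can be reproduced with a plain `sorted()` call encoding the sorter configuration from Example #8 (name ascending, then timestamp and price descending); a quick sanity check under that assumption:

unsorted_ids = [
    {"name": "alex", "timestamp": "20200809", "price": "1000"},
    {"name": "eugene", "timestamp": "20200809", "price": "1500"},
    {"name": "james", "timestamp": "20200811", "price": "1009"},
    {"name": "abe", "timestamp": "20200809", "price": "1040"},
    {"name": "will", "timestamp": "20200809", "price": "1002"},
    {"name": "james", "timestamp": "20200713", "price": "1567"},
    {"name": "eugene", "timestamp": "20201129", "price": "1900"},
    {"name": "will", "timestamp": "20200810", "price": "1001"},
    {"name": "james", "timestamp": "20200810", "price": "1003"},
    {"name": "alex", "timestamp": "20200819", "price": "1300"},
]

# name ascending; timestamp (YYYYMMDD) and price descending via negation.
sorted_ids = sorted(
    unsorted_ids,
    key=lambda d: (d["name"], -int(d["timestamp"]), -int(d["price"])),
)
assert [d["name"] for d in sorted_ids] == [
    "abe", "alex", "alex", "eugene", "eugene",
    "james", "james", "james", "will", "will",
]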