def from_object(cls, metric):
    """Construct a MetricIdentifier from an existing Metric object.

    Raises:
        GreatExpectationsError: if `metric` is not a Metric instance.
    """
    if isinstance(metric, Metric):
        return cls(metric.metric_name, metric.metric_kwargs_id)
    raise GreatExpectationsError(
        "Unable to build MetricIdentifier from object of type {} when Metric is "
        "expected.".format(type(metric))
    )
def add_column_row_condition(
    self, domain_kwargs, column_name=None, filter_null=True, filter_nan=False
):
    """Return a copy of domain_kwargs extended with a Spark-native null/nan filter.

    filter_null and filter_nan are handled explicitly here using Spark SQL
    expression syntax, with condition_parser set to "spark".

    Args:
        domain_kwargs: base domain kwargs to copy and extend
        column_name: explicit column to filter; falls back to domain_kwargs["column"]
        filter_null: when True, exclude NULL values
        filter_nan: when True, exclude NaN values

    Raises:
        GreatExpectationsError: if domain_kwargs already carries a row_condition.
    """
    if "row_condition" in domain_kwargs and domain_kwargs["row_condition"]:
        raise GreatExpectationsError(
            "ExecutionEngine does not support updating existing row_conditions."
        )
    updated_kwargs = copy.deepcopy(domain_kwargs)
    assert "column" in domain_kwargs or column_name is not None
    column = column_name if column_name is not None else domain_kwargs["column"]

    if filter_null and filter_nan:
        condition = f"NOT isnan({column}) AND {column} IS NOT NULL"
    elif filter_null:
        condition = f"{column} IS NOT NULL"
    elif filter_nan:
        condition = f"NOT isnan({column})"
    else:
        condition = None
        logger.warning(
            "add_column_row_condition called without specifying a desired row condition"
        )

    if condition is not None:
        updated_kwargs["row_condition"] = condition
    updated_kwargs["condition_parser"] = "spark"
    return updated_kwargs
def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, reader_options=None, limit=None):
    """Build batch kwargs for the asset partition identified by partition_id.

    Args:
        generator_asset: name of the configured asset
        partition_id: the partition to locate among the asset's options
        reader_options: reader options forwarded into the batch kwargs
        limit: optional row limit forwarded into the batch kwargs

    Returns:
        The batch kwargs built from the matching partition key.

    Raises:
        GreatExpectationsError: if no configuration exists for the asset.
        BatchKwargsError: if no partition matches partition_id.
    """
    try:
        asset_config = self._assets[generator_asset]
    except KeyError:
        raise GreatExpectationsError("No asset config found for asset %s" % generator_asset)
    if generator_asset not in self._iterators:
        self._iterators[generator_asset] = {}
    iterator_dict = self._iterators[generator_asset]
    batch_kwargs = None
    for key in self._get_asset_options(generator_asset, iterator_dict):
        if self._partitioner(key=key, asset_config=asset_config) == partition_id:
            batch_kwargs = self._build_batch_kwargs(
                key=key, asset_config=asset_config, reader_options=reader_options, limit=limit)
    if batch_kwargs is None:
        raise BatchKwargsError(
            "Unable to identify partition %s for asset %s" % (partition_id, generator_asset),
            # Fix: use string keys; previously the variable *values* were used
            # as keys, producing an unreadable detail dict.
            {
                "generator_asset": generator_asset,
                "partition_id": partition_id,
            })
    return batch_kwargs
def __init__(self, name="default", datasource=None, assets=None):
    """Initialize the generator: validate asset configs and probe the engine.

    Args:
        name: generator name
        datasource: owning datasource; when present, its engine is inspected
        assets: mapping of asset name to raw asset configuration

    Raises:
        GreatExpectationsError: if an asset configuration fails schema validation.
    """
    super().__init__(name=name, datasource=datasource)
    assets = assets or {}
    try:
        self._assets = {}
        for asset_name, raw_config in assets.items():
            self._assets[asset_name] = assetConfigurationSchema.load(raw_config)
    except ValidationError as err:
        raise GreatExpectationsError(
            "Unable to load asset configuration in TableBatchKwargsGenerator '%s': "
            "validation error: %s." % (name, str(err))
        )
    if datasource is not None:
        self.engine = datasource.engine
        try:
            self.inspector = sqlalchemy.inspect(self.engine)
        except sqlalchemy.exc.OperationalError:
            # An unreachable database should not prevent construction;
            # downstream callers must tolerate a None inspector.
            logger.warning(
                "Unable to create inspector from engine in batch kwargs generator '%s'" % name
            )
            self.inspector = None
def build_batch_kwargs_from_partition_id(self, generator_asset, partition_id=None, batch_kwargs=None, **kwargs):
    """Build batch kwargs for the asset partition identified by partition_id.

    Args:
        generator_asset: name of the configured asset
        partition_id: the partition to locate among the asset's options
        batch_kwargs: extra kwargs merged over **kwargs before updating the result
        **kwargs: additional overrides merged into the built batch kwargs

    Returns:
        The batch kwargs built from the matching partition key, updated with overrides.

    Raises:
        GreatExpectationsError: if no configuration exists for the asset.
        BatchKwargsError: if no partition matches partition_id.
    """
    try:
        asset_config = self._assets[generator_asset]
    except KeyError:
        raise GreatExpectationsError("No asset config found for asset %s" % generator_asset)
    if generator_asset not in self._iterators:
        self._iterators[generator_asset] = {}
    iterator_dict = self._iterators[generator_asset]
    new_kwargs = None
    for key in self._get_asset_options(generator_asset, iterator_dict):
        if self._partitioner(key=key, asset_config=asset_config) == partition_id:
            new_kwargs = self._build_batch_kwargs(
                key=key, asset_config=asset_config)
    if new_kwargs is None:
        raise BatchKwargsError(
            "Unable to identify partition %s for asset %s" % (partition_id, generator_asset),
            # Fix: use string keys; previously the variable *values* were used
            # as keys, producing an unreadable detail dict.
            {
                "generator_asset": generator_asset,
                "partition_id": partition_id,
            })
    if batch_kwargs is not None:
        kwargs.update(batch_kwargs)
    # kwargs is always a dict here; only update when non-empty
    # (the previous `is not None` test was always true).
    if kwargs:
        new_kwargs.update(kwargs)
    return new_kwargs
def load_batch_data(self, batch_id: str, batch_data: Any) -> None:
    """Register batch data, wrapping a raw Spark DataFrame as SparkDFBatchData.

    Raises:
        GreatExpectationsError: if batch_data is neither a DataFrame nor a
            SparkDFBatchData instance.
    """
    if isinstance(batch_data, DataFrame):
        wrapped = SparkDFBatchData(self, batch_data)
    elif isinstance(batch_data, SparkDFBatchData):
        wrapped = batch_data
    else:
        raise GreatExpectationsError(
            "SparkDFExecutionEngine requires batch data that is either a DataFrame or a SparkDFBatchData object"
        )
    super().load_batch_data(batch_id=batch_id, batch_data=wrapped)
def singularize(plural_ge_noun):
    """Return the singular form of a Great Expectations plural noun.

    Raises:
        GreatExpectationsError: if the noun has no entry in the lookup table.
    """
    lookup_key = plural_ge_noun.lower()
    if lookup_key not in PLURAL_TO_SINGULAR_LOOKUP_DICT:
        raise GreatExpectationsError(
            f"Unable to singularize '{plural_ge_noun}'. Please update "
            f"great_expectations.util.PLURAL_TO_SINGULAR_LOOKUP_DICT.")
    return PLURAL_TO_SINGULAR_LOOKUP_DICT[lookup_key]
def pluralize(singular_ge_noun):
    """Return the plural form of a Great Expectations singular noun.

    Raises:
        GreatExpectationsError: if the noun has no entry in the lookup table.
    """
    try:
        return SINGULAR_TO_PLURAL_LOOKUP_DICT[singular_ge_noun.lower()]
    except KeyError:
        # Message punctuation made consistent with the sibling singularize()
        # helper (trailing period added).
        raise GreatExpectationsError(
            f"Unable to pluralize '{singular_ge_noun}'. Please update "
            f"great_expectations.util.SINGULAR_TO_PLURAL_LOOKUP_DICT.")
def from_object(cls, validation_metric):
    """Construct a ValidationMetricIdentifier from a ValidationMetric.

    Raises:
        GreatExpectationsError: if the argument is not a ValidationMetric.
    """
    if isinstance(validation_metric, ValidationMetric):
        return cls(
            validation_metric.expectation_suite_identifier,
            validation_metric.run_id,
            validation_metric.metric_name,
            validation_metric.metric_kwargs_id,
        )
    raise GreatExpectationsError(
        "Unable to build ValidationMetricIdentifier from object of type {} when "
        "ValidationMetric is expected.".format(type(validation_metric)))
def from_fixed_length_tuple(cls, tuple_):
    """Rebuild a ValidationMetricIdentifier from its 4-element fixed-length tuple.

    Expected layout: (run_id, suite_name, metric_name, metric_kwargs_id).

    Raises:
        GreatExpectationsError: if the tuple does not have exactly 4 elements.
    """
    if len(tuple_) != 4:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier fixed length tuple must have exactly four "
            "components.")
    run_id, suite_name, metric_name, metric_kwargs_id = tuple_
    suite_identifier = ExpectationSuiteIdentifier.from_fixed_length_tuple(
        (suite_name,)
    )
    return cls(
        run_id=run_id,
        expectation_suite_identifier=suite_identifier,
        metric_name=metric_name,
        metric_kwargs_id=metric_kwargs_id,
    )
def _split_multi_column_metric_domain_kwargs(
    self,
    domain_kwargs: Dict,
    domain_type: MetricDomainTypes,
) -> SplitDomainKwargs:
    """Split domain_kwargs for multicolumn domain types into compute and accessor domain kwargs.

    Args:
        domain_kwargs: A dictionary consisting of the domain kwargs specifying which data to obtain
        domain_type: an Enum value indicating which metric domain the user would like to be using.

    Returns:
        compute_domain_kwargs, accessor_domain_kwargs split from domain_kwargs
        The union of compute_domain_kwargs, accessor_domain_kwargs is the input domain_kwargs
    """
    assert (
        domain_type == MetricDomainTypes.MULTICOLUMN
    ), "This method only supports MetricDomainTypes.MULTICOLUMN"

    if "column_list" not in domain_kwargs:
        raise GreatExpectationsError("column_list not found within domain_kwargs")

    compute_domain_kwargs: Dict = copy.deepcopy(domain_kwargs)
    column_list = compute_domain_kwargs.pop("column_list")
    if len(column_list) < 2:
        raise GreatExpectationsError("column_list must contain at least 2 columns")

    # Checking if case-sensitive and using appropriate name
    if self.active_batch_data.use_quoted_name:
        columns = [
            quoted_name(column_name, quote=True) for column_name in column_list
        ]
    else:
        columns = column_list

    accessor_domain_kwargs: Dict = {"column_list": columns}
    return SplitDomainKwargs(compute_domain_kwargs, accessor_domain_kwargs)
def from_tuple(cls, tuple_):
    """Rebuild a ValidationMetricIdentifier from a variable-length tuple.

    Layout: (run_id, *suite_components, metric_name, metric_kwargs_id).

    Raises:
        GreatExpectationsError: if fewer than four components are supplied.
    """
    if len(tuple_) < 4:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier tuple must have at least four components."
        )
    suite_identifier = ExpectationSuiteIdentifier.from_tuple(tuple_[1:-2])
    return cls(
        run_id=tuple_[0],
        expectation_suite_identifier=suite_identifier,
        metric_name=tuple_[-2],
        metric_kwargs_id=tuple_[-1],
    )
def add_column_row_condition(self, domain_kwargs, column_name=None, filter_null=True, filter_nan=False):
    """EXPERIMENTAL

    Return a copy of domain_kwargs extended with a null-filter row condition.

    Args:
        domain_kwargs: the domain kwargs to use as the base and to which to add the condition
        column_name: if provided, use this name to add the condition; otherwise, will use "column"
            key from table_domain_kwargs
        filter_null: if true, add a filter for null values
        filter_nan: if true, add a filter for nan values (unsupported by the base engine)

    Raises:
        GreatExpectationsError: if filter_nan is requested, or if domain_kwargs
            already carries a row_condition.
    """
    if filter_null is False and filter_nan is False:
        # Nothing requested: return the input unchanged (no copy is made).
        logger.warning(
            "add_column_row_condition called with no filter condition requested"
        )
        return domain_kwargs

    if filter_nan:
        raise GreatExpectationsError(
            "Base ExecutionEngine does not support adding nan condition filters"
        )

    if "row_condition" in domain_kwargs and domain_kwargs["row_condition"]:
        raise GreatExpectationsError(
            "ExecutionEngine does not support updating existing row_conditions."
        )

    assert "column" in domain_kwargs or column_name is not None
    column = column_name if column_name is not None else domain_kwargs["column"]

    extended_kwargs = copy.deepcopy(domain_kwargs)
    extended_kwargs["condition_parser"] = "great_expectations__experimental__"
    extended_kwargs["row_condition"] = f'col("{column}").notnull()'
    return extended_kwargs
def _build_batch_kwargs(self, batch_parameters):
    """Build batch kwargs from batch_parameters, resolving a partition_id if present.

    Args:
        batch_parameters: dict of parameters; must contain "data_asset_name" and
            may contain "partition_id" (both are popped here).

    Returns:
        Batch kwargs for the matched partition, or the yielded batch kwargs when
        no partition_id was supplied.

    Raises:
        BatchKwargsError: if no data_asset_name is provided, or no partition matches.
        GreatExpectationsError: if no asset config exists for the named asset.
    """
    try:
        data_asset_name = batch_parameters.pop("data_asset_name")
    except KeyError:
        raise BatchKwargsError(
            "Unable to build BatchKwargs: no name provided in batch_parameters.",
            batch_kwargs=batch_parameters,
        )
    partition_id = batch_parameters.pop("partition_id", None)
    batch_kwargs = self._datasource.process_batch_parameters(batch_parameters)
    if partition_id:
        try:
            asset_config = self._assets[data_asset_name]
        except KeyError:
            raise GreatExpectationsError(
                "No asset config found for asset %s" % data_asset_name)
        if data_asset_name not in self._iterators:
            self._iterators[data_asset_name] = {}
        iterator_dict = self._iterators[data_asset_name]
        for key in self._get_asset_options(asset_config, iterator_dict):
            if (self._partitioner(key=key, asset_config=asset_config) == partition_id):
                batch_kwargs = self._build_batch_kwargs_from_key(
                    key=key,
                    asset_config=asset_config,
                    reader_options=batch_parameters.get("reader_options"),  # handled in generator
                    limit=batch_kwargs.get("limit"),  # may have been processed from datasource
                )
        # NOTE(review): batch_kwargs was assigned from process_batch_parameters
        # above, so this None-check may never fire when no partition matches —
        # confirm intended behavior.
        if batch_kwargs is None:
            raise BatchKwargsError(
                "Unable to identify partition %s for asset %s" % (partition_id, data_asset_name),
                # Fix: use string keys; previously the variable *values* were
                # used as keys, producing an unreadable detail dict.
                {
                    "data_asset_name": data_asset_name,
                    "partition_id": partition_id,
                },
            )
        return batch_kwargs
    else:
        return self.yield_batch_kwargs(
            data_asset_name=data_asset_name, **batch_parameters, **batch_kwargs)
def _build_evr(self, raw_response, configuration):
    """_build_evr is a lightweight convenience wrapper handling cases where an Expectation
    implementor fails to return an EVR but returns the necessary components in a dictionary.

    Either way, the expectation_config is attached before returning.

    Raises:
        GreatExpectationsError: if raw_response is neither an EVR nor a dict.
    """
    if isinstance(raw_response, ExpectationValidationResult):
        evr = raw_response
    elif isinstance(raw_response, dict):
        evr = ExpectationValidationResult(**raw_response)
    else:
        raise GreatExpectationsError("Unable to build EVR")
    evr.expectation_config = configuration
    return evr
def profile(cls, data_asset, run_id=None):
    """Profile data_asset and validate it against the generated suite.

    Returns:
        Tuple of (expectation_suite, validation_results).

    Raises:
        GreatExpectationsError: if data_asset fails cls.validate.
    """
    if not cls.validate(data_asset):
        raise GreatExpectationsError("Invalid data_asset for profiler; aborting")
    suite = cls._profile(data_asset)
    suite = cls.add_meta(suite, data_asset.batch_kwargs)
    results = data_asset.validate(suite, run_id=run_id, result_format="SUMMARY")
    return suite, results
def profile(
    cls,
    data_asset,
    run_id=None,
    profiler_configuration=None,
    run_name=None,
    run_time=None,
):
    """Profile data_asset, validate it against the generated suite, and add a citation.

    Args:
        data_asset: the data asset to profile
        run_id: a RunIdentifier, dict, or (deprecated) string run identifier;
            mutually exclusive with run_name/run_time
        profiler_configuration: optional configuration passed through to _profile
        run_name: optional run name used when run_id is not supplied
        run_time: optional run time used when run_id is not supplied

    Returns:
        Tuple of (expectation_suite, validation_results).

    Raises:
        GreatExpectationsError: if data_asset fails cls.validate.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."
    if isinstance(run_id, str) and not run_name:
        # deprecated-v0.11.0
        # Fix: added the missing space between "provide" and "run_name"
        # (the adjacent string literals previously concatenated without one).
        warnings.warn(
            "String run_ids are deprecated as of v0.11.0 and support will be removed in v0.16. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional). Instead of providing a run_id, you may also provide "
            "run_name and run_time separately.",
            DeprecationWarning,
        )
        # A string run_id may actually be a timestamp; try to parse it as one.
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            pass
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif not isinstance(run_id, RunIdentifier):
        run_name = run_name or "profiling"
        run_id = RunIdentifier(run_name=run_name, run_time=run_time)

    if not cls.validate(data_asset):
        raise GreatExpectationsError("Invalid data_asset for profiler; aborting")

    expectation_suite = cls._profile(data_asset, configuration=profiler_configuration)

    batch_kwargs = data_asset.batch_kwargs
    expectation_suite = cls.add_meta(expectation_suite, batch_kwargs)
    validation_results = data_asset.validate(
        expectation_suite, run_id=run_id, result_format="SUMMARY"
    )
    expectation_suite.add_citation(
        comment=f"{str(cls.__name__)} added a citation based on the current batch.",
        batch_kwargs=data_asset.batch_kwargs,
        batch_markers=data_asset.batch_markers,
        batch_parameters=data_asset.batch_parameters,
    )
    return expectation_suite, validation_results
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Return column metadata for the batch named by the domain kwargs.

    Falls back to the engine's active batch when no batch_id is supplied.

    Raises:
        GreatExpectationsError: if no batch can be identified or the identified
            batch is not loaded in the execution engine.
    """
    batch_id = metric_domain_kwargs.get("batch_id")
    if batch_id is None:
        batch_id = execution_engine.active_batch_data_id
        if batch_id is None:
            raise GreatExpectationsError(
                "batch_id could not be determined from domain kwargs and no active_batch_data is loaded into the "
                "execution engine")
    batch_data = execution_engine.loaded_batch_data_dict.get(batch_id)
    if batch_data is None:
        raise GreatExpectationsError(
            "the requested batch is not available; please load the batch into the execution engine."
        )
    return _get_sqlalchemy_column_metadata(execution_engine.engine, batch_data)
def build_batch_kwargs_from_partition_id(
    self,
    generator_asset=None,
    data_asset_name=None,
    partition_id=None,
    reader_options=None,
    limit=None,
):
    """Build batch kwargs for the partition of a named data asset.

    Exactly one of generator_asset (deprecated) or data_asset_name must be given.

    Args:
        generator_asset: deprecated alias of data_asset_name
        data_asset_name: name of the configured asset
        partition_id: the partition to locate among the asset's options
        reader_options: reader options forwarded into the batch kwargs
        limit: optional row limit forwarded into the batch kwargs

    Returns:
        The batch kwargs built from the matching partition key.

    Raises:
        GreatExpectationsError: if no configuration exists for the asset.
        BatchKwargsError: if no partition matches partition_id.
    """
    assert (generator_asset and not data_asset_name) or (
        not generator_asset and data_asset_name
    ), "Please provide either generator_asset or data_asset_name."
    if generator_asset:
        warnings.warn(
            "The 'generator_asset' argument will be deprecated and renamed to 'data_asset_name'. "
            "Please update code accordingly.",
            DeprecationWarning,
        )
        data_asset_name = generator_asset
    try:
        asset_config = self._assets[data_asset_name]
    except KeyError:
        raise GreatExpectationsError("No asset config found for asset %s" % data_asset_name)
    if data_asset_name not in self._iterators:
        self._iterators[data_asset_name] = {}
    iterator_dict = self._iterators[data_asset_name]
    batch_kwargs = None
    for key in self._get_asset_options(data_asset_name, iterator_dict):
        if self._partitioner(key=key, asset_config=asset_config) == partition_id:
            batch_kwargs = self._build_batch_kwargs(
                key=key,
                asset_config=asset_config,
                reader_options=reader_options,
                limit=limit,
            )
    if batch_kwargs is None:
        raise BatchKwargsError(
            "Unable to identify partition %s for asset %s" % (partition_id, data_asset_name),
            # Fix: use string keys; previously the variable *values* were used
            # as keys, producing an unreadable detail dict.
            {
                "data_asset_name": data_asset_name,
                "partition_id": partition_id,
            },
        )
    return batch_kwargs
def from_tuple(cls, tuple_):
    """Rebuild a ValidationMetricIdentifier from a variable-length tuple.

    Layout: (run_name, run_time, data_asset_name, *suite_components,
    metric_name, metric_kwargs_id); "__" in position 2 encodes a None
    data_asset_name.

    Raises:
        GreatExpectationsError: if fewer than six components are supplied.
    """
    if len(tuple_) < 6:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier tuple must have at least six components."
        )
    tuple_data_asset_name = None if tuple_[2] == "__" else tuple_[2]
    metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
    run_id = RunIdentifier.from_tuple((tuple_[0], tuple_[1]))
    suite_identifier = ExpectationSuiteIdentifier.from_tuple(tuple_[3:-2])
    return cls(
        run_id=run_id,
        data_asset_name=tuple_data_asset_name,
        expectation_suite_identifier=suite_identifier,
        metric_name=metric_id.metric_name,
        metric_kwargs_id=metric_id.metric_kwargs_id,
    )
def inner_func(*args, **kwargs):
    """Wrap a renderer: append a note for each evaluation parameter the expectation used.

    Calls the wrapped render_func, then, for every $PARAMETER reference in the
    expectation's kwargs that has a value in runtime_configuration's
    evaluation_parameters, appends a "$eval_param = $eval_param_value (at time
    of validation)" string-template block to the rendered output.
    """
    rendered_string_template = render_func(*args, **kwargs)
    current_expectation_params = list()
    app_template_str = (
        "\n - $eval_param = $eval_param_value (at time of validation)."
    )
    # NOTE(review): configuration defaults to None but .kwargs is accessed
    # unconditionally below — presumably callers always pass it; confirm.
    configuration = kwargs.get("configuration", None)
    kwargs_dict = configuration.kwargs
    for key, value in kwargs_dict.items():
        # Values shaped like {"$PARAMETER": ...} are evaluation-parameter references.
        if isinstance(value, dict) and "$PARAMETER" in value.keys():
            current_expectation_params.append(value["$PARAMETER"])

    # if expectation configuration has no eval params, then don't look for the values in runtime_configuration
    if len(current_expectation_params) > 0:
        runtime_configuration = kwargs.get("runtime_configuration", None)
        if runtime_configuration:
            eval_params = runtime_configuration.get("evaluation_parameters", {})
            styling = runtime_configuration.get("styling")
            for key, val in eval_params.items():
                # this needs to be more complicated?
                # the possibility that it is a substring?
                for param in current_expectation_params:
                    # "key in param" condition allows for eval param values to be rendered if arithmetic is present
                    if key == param or key in param:
                        app_params = {}
                        app_params["eval_param"] = key
                        app_params["eval_param_value"] = val
                        to_append = RenderedStringTemplateContent(
                            **{
                                "content_block_type": "string_template",
                                "string_template": {
                                    "template": app_template_str,
                                    "params": app_params,
                                    "styling": styling,
                                },
                            }
                        )
                        rendered_string_template.append(to_append)
        else:
            # Eval params were referenced but no runtime values were supplied.
            raise GreatExpectationsError(
                f"""GE was not able to render the value of evaluation parameters. Expectation {render_func} had evaluation parameters set, but they were not passed in."""
            )
    return rendered_string_template
def from_fixed_length_tuple(cls, tuple_):
    """Rebuild a ValidationMetricIdentifier from its 6-element fixed-length tuple.

    Layout: (run_name, run_time, data_asset_name, suite_name, metric_name,
    metric_kwargs_id); "__" in position 2 encodes a None data_asset_name.

    Raises:
        GreatExpectationsError: if the tuple does not have exactly 6 elements.
    """
    if len(tuple_) != 6:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier fixed length tuple must have exactly six "
            "components.")
    data_asset_name = None if tuple_[2] == "__" else tuple_[2]
    metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
    return cls(
        run_id=RunIdentifier.from_fixed_length_tuple((tuple_[0], tuple_[1])),
        data_asset_name=data_asset_name,
        expectation_suite_identifier=ExpectationSuiteIdentifier.from_fixed_length_tuple(
            (tuple_[3],)
        ),
        metric_name=metric_id.metric_name,
        metric_kwargs_id=metric_id.metric_kwargs_id,
    )
def profile(cls, data_asset, run_id=None):
    """Profile data_asset, validate it against the generated suite, and add a citation.

    Returns:
        Tuple of (expectation_suite, validation_results).

    Raises:
        GreatExpectationsError: if data_asset fails cls.validate.
    """
    if not cls.validate(data_asset):
        raise GreatExpectationsError("Invalid data_asset for profiler; aborting")
    suite = cls._profile(data_asset)
    batch_kwargs = data_asset.batch_kwargs
    suite = cls.add_meta(suite, batch_kwargs)
    results = data_asset.validate(suite, run_id=run_id, result_format="SUMMARY")
    suite.add_citation(
        comment=str(cls.__name__) + " added a citation based on the current batch.",
        batch_kwargs=data_asset.batch_kwargs,
        batch_markers=data_asset.batch_markers,
        batch_parameters=data_asset.batch_parameters)
    return suite, results
def get_compute_domain(
    self,
    domain_kwargs: dict,
    domain_type: Union[str, MetricDomainTypes],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple[DataFrame, dict, dict]:
    """Uses a given batch dictionary and domain kwargs (which include a row condition and a condition parser)
    to obtain and/or query a batch. Returns in the format of a Spark DataFrame.

    Args:
        domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
        domain_type (str or MetricDomainTypes) - an Enum value indicating which metric domain the user would
        like to be using, or a corresponding string value representing it. String types include "identity",
        "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from the
        class MetricDomainTypes.
        accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when
        describing the domain and simply transferred with their associated values into accessor_domain_kwargs.

    Returns:
        A tuple including:
          - a DataFrame (the data on which to compute)
          - a dictionary of compute_domain_kwargs, describing the DataFrame
          - a dictionary of accessor_domain_kwargs, describing any accessors needed to
            identify the domain within the compute domain
    """
    # Row-condition / filter_conditions / ignore_row_if handling happens in
    # get_domain_records; this method only splits the kwargs.
    data = self.get_domain_records(
        domain_kwargs=domain_kwargs,
    )
    # Extracting value from enum if it is given for future computation
    domain_type = MetricDomainTypes(domain_type)

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = {}
    table = domain_kwargs.get("table", None)
    if table:
        raise ValueError(
            "SparkDFExecutionEngine does not currently support multiple named tables."
        )

    # Warning user if accessor keys are in any domain that is not of type table, will be ignored
    if (
        domain_type != MetricDomainTypes.TABLE
        and accessor_keys is not None
        and len(list(accessor_keys)) > 0
    ):
        logger.warning(
            'Accessor keys ignored since Metric Domain Type is not "table"'
        )

    if domain_type == MetricDomainTypes.TABLE:
        # NOTE(review): accessor_keys is consumed twice via list()/iteration;
        # assumes it is not a one-shot iterator — confirm with callers.
        if accessor_keys is not None and len(list(accessor_keys)) > 0:
            for key in accessor_keys:
                accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        if len(compute_domain_kwargs.keys()) > 0:
            # Warn user if kwarg not "normal".
            unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
                {
                    "batch_id",
                    "table",
                    "row_condition",
                    "condition_parser",
                }
            )
            if len(unexpected_keys) > 0:
                unexpected_keys_str: str = ", ".join(
                    map(lambda element: f'"{element}"', unexpected_keys)
                )
                logger.warning(
                    f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
                )
        return data, compute_domain_kwargs, accessor_domain_kwargs

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )
        # The column moves from compute to accessor kwargs.
        accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )
        accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop("column_A")
        accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop("column_B")

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "column_list" not in domain_kwargs:
            raise ge_exceptions.GreatExpectationsError(
                "column_list not found within domain_kwargs"
            )
        column_list = compute_domain_kwargs.pop("column_list")
        if len(column_list) < 2:
            raise ge_exceptions.GreatExpectationsError(
                "column_list must contain at least 2 columns"
            )
        accessor_domain_kwargs["column_list"] = column_list

    return data, compute_domain_kwargs, accessor_domain_kwargs
def resolve_metrics(
    self,
    metrics_to_resolve: Iterable[MetricConfiguration],
    metrics: Dict[Tuple, Any] = None,
    runtime_configuration: dict = None,
) -> dict:
    """resolve_metrics is the main entrypoint for an execution engine. The execution engine will compute the value
    of the provided metrics.

    Args:
        metrics_to_resolve: the metrics to evaluate
        metrics: already-computed metrics currently available to the engine
        runtime_configuration: runtime configuration information

    Returns:
        resolved_metrics (Dict): a dictionary with the values for the metrics that have just been resolved.

    Raises:
        GreatExpectationsError: when a declared metric dependency (including a
            required "metric_partial_fn") is not present in `metrics`.
    """
    if metrics is None:
        metrics = dict()
    resolved_metrics = dict()
    # Partial ("bundled") metric functions accumulate here and are resolved
    # together at the end via resolve_metric_bundle.
    metric_fn_bundle = []
    for metric_to_resolve in metrics_to_resolve:
        metric_class, metric_fn = get_metric_provider(
            metric_name=metric_to_resolve.metric_name, execution_engine=self)
        try:
            # Resolve each declared dependency from the already-computed metrics.
            metric_dependencies = {
                k: metrics[v.id]
                for k, v in metric_to_resolve.metric_dependencies.items()
            }
        except KeyError as e:
            raise GreatExpectationsError(
                f"Missing metric dependency: {str(e)}")
        metric_provider_kwargs = {
            "cls": metric_class,
            "execution_engine": self,
            "metric_domain_kwargs": metric_to_resolve.metric_domain_kwargs,
            "metric_value_kwargs": metric_to_resolve.metric_value_kwargs,
            "metrics": metric_dependencies,
            "runtime_configuration": runtime_configuration,
        }
        if metric_fn is None:
            # No direct provider function: this metric is computed from a
            # previously resolved partial, so defer it to the bundle.
            try:
                (
                    metric_fn,
                    compute_domain_kwargs,
                    accessor_domain_kwargs,
                ) = metric_dependencies.pop("metric_partial_fn")
            except KeyError as e:
                raise GreatExpectationsError(
                    f"Missing metric dependency: {str(e)} for metric {metric_to_resolve.metric_name}"
                )
            metric_fn_bundle.append((
                metric_to_resolve,
                metric_fn,
                compute_domain_kwargs,
                accessor_domain_kwargs,
                metric_provider_kwargs,
            ))
            continue
        # Providers without an explicit type are treated as plain VALUE functions.
        metric_fn_type = getattr(metric_fn, "metric_fn_type",
                                 MetricFunctionTypes.VALUE)
        if metric_fn_type in [
            MetricPartialFunctionTypes.MAP_SERIES,
            MetricPartialFunctionTypes.MAP_FN,
            MetricPartialFunctionTypes.MAP_CONDITION_FN,
            MetricPartialFunctionTypes.MAP_CONDITION_SERIES,
            MetricPartialFunctionTypes.WINDOW_FN,
            MetricPartialFunctionTypes.WINDOW_CONDITION_FN,
            MetricPartialFunctionTypes.AGGREGATE_FN,
        ]:
            # NOTE: 20201026 - JPC - we could use the fact that these metric functions return functions rather
            # than data to optimize compute in the future
            resolved_metrics[metric_to_resolve.id] = metric_fn(
                **metric_provider_kwargs)
        elif metric_fn_type == MetricFunctionTypes.VALUE:
            resolved_metrics[metric_to_resolve.id] = metric_fn(
                **metric_provider_kwargs)
        else:
            # Unrecognized function type: warn but still attempt direct invocation.
            logger.warning(
                f"Unrecognized metric function type while trying to resolve {str(metric_to_resolve.id)}"
            )
            resolved_metrics[metric_to_resolve.id] = metric_fn(
                **metric_provider_kwargs)
    if len(metric_fn_bundle) > 0:
        resolved_metrics.update(
            self.resolve_metric_bundle(metric_fn_bundle))
    return resolved_metrics
def get_domain_records(
    self,
    domain_kwargs: dict,
) -> DataFrame:
    """
    Uses the given domain kwargs (which include row_condition, condition_parser, and ignore_row_if directives) to
    obtain and/or query a batch. Returns in the format of a Spark DataFrame.

    Args:
        domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain

    Returns:
        A DataFrame (the data on which to compute)
    """
    table = domain_kwargs.get("table", None)
    if table:
        raise ValueError(
            "SparkDFExecutionEngine does not currently support multiple named tables."
        )
    batch_id = domain_kwargs.get("batch_id")
    if batch_id is None:
        # We allow no batch id specified if there is only one batch
        if self.active_batch_data:
            data = self.active_batch_data.dataframe
        else:
            raise ValidationError(
                "No batch is specified, but could not identify a loaded batch."
            )
    else:
        if batch_id in self.loaded_batch_data_dict:
            data = self.loaded_batch_data_dict[batch_id].dataframe
        else:
            raise ValidationError(f"Unable to find batch with batch_id {batch_id}")

    # Filtering by row condition.
    row_condition = domain_kwargs.get("row_condition", None)
    if row_condition:
        condition_parser = domain_kwargs.get("condition_parser", None)
        if condition_parser == "spark":
            # Raw Spark SQL expression, passed straight to DataFrame.filter.
            data = data.filter(row_condition)
        elif condition_parser == "great_expectations__experimental__":
            parsed_condition = parse_condition_to_spark(row_condition)
            data = data.filter(parsed_condition)
        else:
            raise GreatExpectationsError(
                f"unrecognized condition_parser {str(condition_parser)} for Spark execution engine"
            )

    # Filtering by filter_conditions
    filter_conditions: List[RowCondition] = domain_kwargs.get(
        "filter_conditions", []
    )
    if len(filter_conditions) > 0:
        filter_condition = self._combine_row_conditions(filter_conditions)
        data = data.filter(filter_condition.condition)

    if "column" in domain_kwargs:
        # Single-column domains need no ignore_row_if handling.
        return data

    # Filtering by ignore_row_if directive
    if (
        "column_A" in domain_kwargs
        and "column_B" in domain_kwargs
        and "ignore_row_if" in domain_kwargs
    ):
        # noinspection PyPep8Naming
        column_A_name = domain_kwargs["column_A"]
        # noinspection PyPep8Naming
        column_B_name = domain_kwargs["column_B"]

        ignore_row_if = domain_kwargs["ignore_row_if"]
        if ignore_row_if == "both_values_are_missing":
            ignore_condition = (
                F.col(column_A_name).isNull() & F.col(column_B_name).isNull()
            )
            data = data.filter(~ignore_condition)
        elif ignore_row_if == "either_value_is_missing":
            ignore_condition = (
                F.col(column_A_name).isNull() | F.col(column_B_name).isNull()
            )
            data = data.filter(~ignore_condition)
        else:
            if ignore_row_if not in ["neither", "never"]:
                raise ValueError(
                    f'Unrecognized value of ignore_row_if ("{ignore_row_if}").'
                )
            if ignore_row_if == "never":
                # deprecated-v0.13.29
                warnings.warn(
                    f"""The correct "no-action" value of the "ignore_row_if" directive for the column pair case is \
"neither" (the use of "{ignore_row_if}" is deprecated as of v0.13.29 and will be removed in v0.16). Please use "neither" moving forward.
""",
                    DeprecationWarning,
                )
        return data

    if "column_list" in domain_kwargs and "ignore_row_if" in domain_kwargs:
        column_list = domain_kwargs["column_list"]
        ignore_row_if = domain_kwargs["ignore_row_if"]
        if ignore_row_if == "all_values_are_missing":
            # Drop rows where every listed column is null.
            conditions = [
                F.col(column_name).isNull() for column_name in column_list
            ]
            ignore_condition = reduce(lambda a, b: a & b, conditions)
            data = data.filter(~ignore_condition)
        elif ignore_row_if == "any_value_is_missing":
            # Drop rows where at least one listed column is null.
            conditions = [
                F.col(column_name).isNull() for column_name in column_list
            ]
            ignore_condition = reduce(lambda a, b: a | b, conditions)
            data = data.filter(~ignore_condition)
        else:
            if ignore_row_if != "never":
                raise ValueError(
                    f'Unrecognized value of ignore_row_if ("{ignore_row_if}").'
                )
        return data

    return data
def get_domain_records(
    self,
    domain_kwargs: Dict,
) -> Selectable:
    """
    Uses the given domain kwargs (which include row_condition, condition_parser, and ignore_row_if directives) to
    obtain and/or query a batch. Returns in the format of an SqlAlchemy table/column(s) object.

    Args:
        domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain

    Returns:
        An SqlAlchemy table/column(s) (the selectable object for obtaining data on which to compute)
    """
    batch_id = domain_kwargs.get("batch_id")
    if batch_id is None:
        # We allow no batch id specified if there is only one batch
        if self.active_batch_data:
            data_object = self.active_batch_data
        else:
            raise GreatExpectationsError(
                "No batch is specified, but could not identify a loaded batch."
            )
    else:
        if batch_id in self.loaded_batch_data_dict:
            data_object = self.loaded_batch_data_dict[batch_id]
        else:
            raise GreatExpectationsError(
                f"Unable to find batch with batch_id {batch_id}"
            )

    selectable: Selectable
    if "table" in domain_kwargs and domain_kwargs["table"] is not None:
        # TODO: Add logic to handle record_set_name once implemented
        # (i.e. multiple record sets (tables) in one batch
        if domain_kwargs["table"] != data_object.selectable.name:
            selectable = sa.Table(
                domain_kwargs["table"],
                sa.MetaData(),
                schema=data_object._schema_name,
            )
        else:
            selectable = data_object.selectable
    elif "query" in domain_kwargs:
        raise ValueError(
            "query is not currently supported by SqlAlchemyExecutionEngine"
        )
    else:
        selectable = data_object.selectable

    """
    If a custom query is passed, selectable will be TextClause and not formatted
    as a subquery wrapped in "(subquery) alias". TextClause must first be converted
    to TextualSelect using sa.columns() before it can be converted to type Subquery
    """
    if TextClause and isinstance(selectable, TextClause):
        selectable = selectable.columns().subquery()

    # Filtering by row condition.
    if (
        "row_condition" in domain_kwargs
        and domain_kwargs["row_condition"] is not None
    ):
        condition_parser = domain_kwargs["condition_parser"]
        if condition_parser == "great_expectations__experimental__":
            parsed_condition = parse_condition_to_sqlalchemy(
                domain_kwargs["row_condition"]
            )
            selectable = (
                sa.select([sa.text("*")])
                .select_from(selectable)
                .where(parsed_condition)
            )
        else:
            raise GreatExpectationsError(
                "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser."
            )

    # Filtering by filter_conditions
    filter_conditions: List[RowCondition] = domain_kwargs.get(
        "filter_conditions", []
    )
    # For SqlAlchemyExecutionEngine only one filter condition is allowed
    if len(filter_conditions) == 1:
        filter_condition = filter_conditions[0]
        assert (
            filter_condition.condition_type == RowConditionParserType.GE
        ), "filter_condition must be of type GE for SqlAlchemyExecutionEngine"
        selectable = (
            sa.select([sa.text("*")])
            .select_from(selectable)
            .where(parse_condition_to_sqlalchemy(filter_condition.condition))
        )
    elif len(filter_conditions) > 1:
        raise GreatExpectationsError(
            "SqlAlchemyExecutionEngine currently only supports a single filter condition."
        )

    if "column" in domain_kwargs:
        # Single-column domains need no ignore_row_if handling.
        return selectable

    # Filtering by ignore_row_if directive
    if (
        "column_A" in domain_kwargs
        and "column_B" in domain_kwargs
        and "ignore_row_if" in domain_kwargs
    ):
        if self.active_batch_data.use_quoted_name:
            # Checking if case-sensitive and using appropriate name
            # noinspection PyPep8Naming
            column_A_name = quoted_name(domain_kwargs["column_A"], quote=True)
            # noinspection PyPep8Naming
            column_B_name = quoted_name(domain_kwargs["column_B"], quote=True)
        else:
            # noinspection PyPep8Naming
            column_A_name = domain_kwargs["column_A"]
            # noinspection PyPep8Naming
            column_B_name = domain_kwargs["column_B"]

        ignore_row_if = domain_kwargs["ignore_row_if"]
        if ignore_row_if == "both_values_are_missing":
            selectable = get_sqlalchemy_selectable(
                sa.select([sa.text("*")])
                .select_from(get_sqlalchemy_selectable(selectable))
                .where(
                    sa.not_(
                        sa.and_(
                            sa.column(column_A_name) == None,
                            sa.column(column_B_name) == None,
                        )
                    )
                )
            )
        elif ignore_row_if == "either_value_is_missing":
            selectable = get_sqlalchemy_selectable(
                sa.select([sa.text("*")])
                .select_from(get_sqlalchemy_selectable(selectable))
                .where(
                    sa.not_(
                        sa.or_(
                            sa.column(column_A_name) == None,
                            sa.column(column_B_name) == None,
                        )
                    )
                )
            )
        else:
            if ignore_row_if not in ["neither", "never"]:
                raise ValueError(
                    f'Unrecognized value of ignore_row_if ("{ignore_row_if}").'
                )
            if ignore_row_if == "never":
                # deprecated-v0.13.29
                warnings.warn(
                    f"""The correct "no-action" value of the "ignore_row_if" directive for the column pair case is \
"neither" (the use of "{ignore_row_if}" is deprecated as of v0.13.29 and will be removed in v0.16). Please use "neither" moving forward.
""",
                    DeprecationWarning,
                )
        return selectable

    if "column_list" in domain_kwargs and "ignore_row_if" in domain_kwargs:
        if self.active_batch_data.use_quoted_name:
            # Checking if case-sensitive and using appropriate name
            # NOTE(review): indexing domain_kwargs by column_name here looks
            # suspicious — it likely should quote column_name itself
            # (quoted_name(column_name, ...)); confirm against the Spark
            # implementation, which uses the column names directly.
            column_list = [
                quoted_name(domain_kwargs[column_name], quote=True)
                for column_name in domain_kwargs["column_list"]
            ]
        else:
            column_list = domain_kwargs["column_list"]

        ignore_row_if = domain_kwargs["ignore_row_if"]
        if ignore_row_if == "all_values_are_missing":
            # Drop rows where every listed column is NULL.
            selectable = get_sqlalchemy_selectable(
                sa.select([sa.text("*")])
                .select_from(get_sqlalchemy_selectable(selectable))
                .where(
                    sa.not_(
                        sa.and_(
                            *(
                                sa.column(column_name) == None
                                for column_name in column_list
                            )
                        )
                    )
                )
            )
        elif ignore_row_if == "any_value_is_missing":
            # Drop rows where at least one listed column is NULL.
            selectable = get_sqlalchemy_selectable(
                sa.select([sa.text("*")])
                .select_from(get_sqlalchemy_selectable(selectable))
                .where(
                    sa.not_(
                        sa.or_(
                            *(
                                sa.column(column_name) == None
                                for column_name in column_list
                            )
                        )
                    )
                )
            )
        else:
            if ignore_row_if != "never":
                raise ValueError(
                    f'Unrecognized value of ignore_row_if ("{ignore_row_if}").'
                )
        return selectable

    return selectable
def get_compute_domain(
    self,
    domain_kwargs: Dict,
    domain_type: Union[str, "MetricDomainTypes"],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple["sa.sql.Selectable", dict, dict]:
    """Uses a given batch dictionary and domain kwargs to obtain a SqlAlchemy selectable for the compute domain.

    Args:
        domain_kwargs (dict): A dictionary consisting of the domain kwargs specifying which data to obtain.
        domain_type (str or "MetricDomainTypes"): An Enum value indicating which metric domain the user would
            like to be using, or a corresponding string value representing it. String types include "identity",
            "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from
            the class MetricDomainTypes.
        accessor_keys (str iterable): Keys that are part of the compute domain but should be ignored when
            describing the domain and simply transferred with their associated values into
            accessor_domain_kwargs.

    Returns:
        A tuple of (selectable, compute_domain_kwargs, accessor_domain_kwargs).

    Raises:
        GreatExpectationsError: If no batch can be identified, a required column kwarg is missing,
            or an unsupported condition_parser is supplied.
        ValueError: If an unrecognized table name is given, or if a "query" domain kwarg is present.
    """
    # Extracting value from enum if it is given for future computation
    domain_type = MetricDomainTypes(domain_type)

    # Resolve which loaded batch backs this domain: an explicit batch_id wins,
    # otherwise fall back to the active batch.
    batch_id = domain_kwargs.get("batch_id")
    if batch_id is None:
        # We allow no batch id specified if there is only one batch
        if self.active_batch_data:
            data_object = self.active_batch_data
        else:
            raise GreatExpectationsError(
                "No batch is specified, but could not identify a loaded batch."
            )
    else:
        if batch_id in self.loaded_batch_data_dict:
            data_object = self.loaded_batch_data_dict[batch_id]
        else:
            raise GreatExpectationsError(
                f"Unable to find batch with batch_id {batch_id}"
            )

    # compute_domain_kwargs starts as a deep copy of the input; accessor keys
    # are popped out of it and moved into accessor_domain_kwargs below.
    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = dict()
    if "table" in domain_kwargs and domain_kwargs["table"] is not None:
        # Only the batch's own record set may be named explicitly.
        if domain_kwargs["table"] != data_object.record_set_name:
            raise ValueError("Unrecognized table name.")
        else:
            selectable = data_object.selectable
    elif "query" in domain_kwargs:
        raise ValueError(
            "query is not currently supported by SqlAlchemyExecutionEngine"
        )
    else:
        selectable = data_object.selectable

    # Apply an optional row filter; only the experimental GE condition parser
    # is supported for SQL backends.
    if (
        "row_condition" in domain_kwargs
        and domain_kwargs["row_condition"] is not None
    ):
        condition_parser = domain_kwargs["condition_parser"]
        if condition_parser == "great_expectations__experimental__":
            parsed_condition = parse_condition_to_sqlalchemy(
                domain_kwargs["row_condition"]
            )
            selectable = sa.select(
                "*", from_obj=selectable, whereclause=parsed_condition
            )
        else:
            raise GreatExpectationsError(
                "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser."
            )

    # Warning user if accessor keys are in any domain that is not of type table, will be ignored
    # NOTE(review): len(accessor_keys) requires a sized container even though the
    # annotation allows any Iterable — a generator argument would raise TypeError
    # here; confirm what callers actually pass.
    if (
        domain_type != MetricDomainTypes.TABLE
        and accessor_keys is not None
        and len(accessor_keys) > 0
    ):
        logger.warning(
            "Accessor keys ignored since Metric Domain Type is not 'table'"
        )

    if domain_type == MetricDomainTypes.TABLE:
        # Move accessor keys out of the compute kwargs, then warn on anything unexpected.
        if accessor_keys is not None and len(accessor_keys) > 0:
            for key in accessor_keys:
                accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        if len(domain_kwargs.keys()) > 0:
            for key in compute_domain_kwargs.keys():
                # Warning user if kwarg not "normal"
                if key not in [
                    "batch_id",
                    "table",
                    "row_condition",
                    "condition_parser",
                ]:
                    logger.warning(
                        f"Unexpected key {key} found in domain_kwargs for domain type {domain_type.value}"
                    )
        return selectable, compute_domain_kwargs, accessor_domain_kwargs

    # If user has stated they want a column, checking if one is provided
    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" in compute_domain_kwargs:
            # Checking if case-sensitive and using appropriate name
            # NOTE(review): quoted_name(...) is called without quote=True here,
            # unlike sibling code paths in this file — confirm intended.
            if self.active_batch_data.use_quoted_name:
                accessor_domain_kwargs["column"] = quoted_name(
                    compute_domain_kwargs.pop("column")
                )
            else:
                accessor_domain_kwargs["column"] = compute_domain_kwargs.pop(
                    "column"
                )
        else:
            # If column not given
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )

    # Else, if column pair values requested
    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        # Ensuring column_A and column_B parameters provided
        if (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            if self.active_batch_data.use_quoted_name:
                # If case matters...
                accessor_domain_kwargs["column_A"] = quoted_name(
                    compute_domain_kwargs.pop("column_A")
                )
                accessor_domain_kwargs["column_B"] = quoted_name(
                    compute_domain_kwargs.pop("column_B")
                )
            else:
                accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop(
                    "column_A"
                )
                accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop(
                    "column_B"
                )
        else:
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )

    # Multicolumn domains simply transfer the "columns" list (when present)
    # into the accessor kwargs.
    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "columns" in compute_domain_kwargs:
            # If columns exist
            accessor_domain_kwargs["columns"] = compute_domain_kwargs.pop("columns")

    # Identity domains narrow the selectable itself to the requested column(s).
    elif domain_type == MetricDomainTypes.IDENTITY:
        # If we would like our data to become a single column
        if "column" in compute_domain_kwargs:
            if self.active_batch_data.use_quoted_name:
                selectable = sa.select(
                    [sa.column(quoted_name(compute_domain_kwargs["column"]))]
                ).select_from(selectable)
            else:
                selectable = sa.select(
                    [sa.column(compute_domain_kwargs["column"])]
                ).select_from(selectable)

        # If we would like our data to now become a column pair
        elif ("column_A" in compute_domain_kwargs) and (
            "column_B" in compute_domain_kwargs
        ):
            if self.active_batch_data.use_quoted_name:
                selectable = sa.select(
                    [
                        sa.column(quoted_name(compute_domain_kwargs["column_A"])),
                        sa.column(quoted_name(compute_domain_kwargs["column_B"])),
                    ]
                ).select_from(selectable)
            else:
                selectable = sa.select(
                    [
                        sa.column(compute_domain_kwargs["column_A"]),
                        sa.column(compute_domain_kwargs["column_B"]),
                    ]
                ).select_from(selectable)
        else:
            # If we would like our data to become a multicolumn
            if "columns" in compute_domain_kwargs:
                if self.active_batch_data.use_quoted_name:
                    # Building a list of column objects used for sql alchemy selection
                    to_select = [
                        sa.column(quoted_name(col))
                        for col in compute_domain_kwargs["columns"]
                    ]
                    selectable = sa.select(to_select).select_from(selectable)
                else:
                    to_select = [
                        sa.column(col) for col in compute_domain_kwargs["columns"]
                    ]
                    selectable = sa.select(to_select).select_from(selectable)

    # Letting selectable fall through
    return selectable, compute_domain_kwargs, accessor_domain_kwargs
def get_compute_domain(
    self,
    domain_kwargs: Dict,
    domain_type: Union[str, MetricDomainTypes],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple[Selectable, dict, dict]:
    """Uses a given batch dictionary and domain kwargs to obtain a SqlAlchemy selectable for the compute domain.

    Args:
        domain_kwargs (dict): A dictionary consisting of the domain kwargs specifying which data to obtain.
        domain_type (str or MetricDomainTypes): An Enum value indicating which metric domain the user would
            like to be using, or a corresponding string value representing it. String types include "identity",
            "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from
            the class MetricDomainTypes.
        accessor_keys (str iterable): Keys that are part of the compute domain but should be ignored when
            describing the domain and simply transferred with their associated values into
            accessor_domain_kwargs.

    Returns:
        A tuple of (selectable, compute_domain_kwargs, accessor_domain_kwargs).

    Raises:
        GreatExpectationsError: If a required column kwarg is missing for the requested domain type,
            or if column_list has fewer than 2 entries for a multicolumn domain.
    """
    selectable = self.get_domain_records(domain_kwargs=domain_kwargs)

    # Extracting value from enum if it is given for future computation
    domain_type = MetricDomainTypes(domain_type)

    # BUGFIX: materialize accessor_keys exactly once. The previous code called
    # len(list(accessor_keys)) for the size check and then iterated accessor_keys
    # again; a one-shot iterator (e.g. a generator) would be exhausted by the
    # list() call, so the for-loop below would silently transfer no keys.
    if accessor_keys is not None:
        accessor_keys = list(accessor_keys)

    # Warning user if accessor keys are in any domain that is not of type table, will be ignored
    if (
        domain_type != MetricDomainTypes.TABLE
        and accessor_keys is not None
        and len(accessor_keys) > 0
    ):
        logger.warning(
            'Accessor keys ignored since Metric Domain Type is not "table"'
        )

    # compute_domain_kwargs starts as a deep copy of the input; accessor keys
    # are popped out of it and moved into accessor_domain_kwargs below.
    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = {}
    if domain_type == MetricDomainTypes.TABLE:
        # Move accessor keys out of the compute kwargs, then warn on anything unexpected.
        if accessor_keys is not None and len(accessor_keys) > 0:
            for key in accessor_keys:
                accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        if len(domain_kwargs.keys()) > 0:
            # Warn user if kwarg not "normal".
            unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
                {
                    "batch_id",
                    "table",
                    "row_condition",
                    "condition_parser",
                }
            )
            if len(unexpected_keys) > 0:
                unexpected_keys_str: str = ", ".join(
                    map(lambda element: f'"{element}"', unexpected_keys)
                )
                logger.warning(
                    f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
                )
        return selectable, compute_domain_kwargs, accessor_domain_kwargs

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )
        # Checking if case-sensitive and using appropriate name
        if self.active_batch_data.use_quoted_name:
            accessor_domain_kwargs["column"] = quoted_name(
                compute_domain_kwargs.pop("column"), quote=True
            )
        else:
            accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")
        return selectable, compute_domain_kwargs, accessor_domain_kwargs

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )
        # Checking if case-sensitive and using appropriate name
        if self.active_batch_data.use_quoted_name:
            accessor_domain_kwargs["column_A"] = quoted_name(
                compute_domain_kwargs.pop("column_A"), quote=True
            )
            accessor_domain_kwargs["column_B"] = quoted_name(
                compute_domain_kwargs.pop("column_B"), quote=True
            )
        else:
            accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop(
                "column_A"
            )
            accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop(
                "column_B"
            )
        return selectable, compute_domain_kwargs, accessor_domain_kwargs

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "column_list" not in domain_kwargs:
            raise GreatExpectationsError(
                "column_list not found within domain_kwargs"
            )
        column_list = compute_domain_kwargs.pop("column_list")
        if len(column_list) < 2:
            raise GreatExpectationsError(
                "column_list must contain at least 2 columns"
            )
        # Checking if case-sensitive and using appropriate name
        if self.active_batch_data.use_quoted_name:
            accessor_domain_kwargs["column_list"] = [
                quoted_name(column_name, quote=True)
                for column_name in column_list
            ]
        else:
            accessor_domain_kwargs["column_list"] = column_list
        return selectable, compute_domain_kwargs, accessor_domain_kwargs

    # Letting selectable fall through
    return selectable, compute_domain_kwargs, accessor_domain_kwargs