def __init__(self, expectation_type, kwargs, meta=None, success_on_last_run=None):
    """Build an expectation configuration from a type name and its kwargs.

    Args:
        expectation_type: Name of the expectation; must be a string.
        kwargs: Plain dict of expectation arguments. It is wrapped in an
            ``ExpectationKwargs`` instance before being stored.
        meta: Optional metadata. A fresh empty dict is used when omitted.
        success_on_last_run: Stored unchanged on the instance.

    Raises:
        InvalidExpectationConfigurationError: If ``expectation_type`` is not
            a string or ``kwargs`` is not a dict.
    """
    type_is_string = isinstance(expectation_type, string_types)
    if not type_is_string:
        raise InvalidExpectationConfigurationError("expectation_type must be a string")
    self._expectation_type = expectation_type

    kwargs_is_dict = isinstance(kwargs, dict)
    if not kwargs_is_dict:
        raise InvalidExpectationConfigurationError(
            "expectation configuration kwargs must be an ExpectationKwargs object."
        )
    self._kwargs = ExpectationKwargs(kwargs)

    # Meta must be serializable, but it is stored as given: the check below
    # only inspects the object, conversion is deferred until necessary.
    metadata = {} if meta is None else meta
    ensure_json_serializable(metadata)
    self.meta = metadata
    self.success_on_last_run = success_on_last_run
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
):
    """Validate the ``quantile_ranges`` kwarg of a quantile expectation.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when every check passes.

    Raises:
        InvalidExpectationConfigurationError: If ``quantile_ranges`` is
            missing or is not a dictionary.
        ValueError: If ``allow_relative_error`` is anything other than
            ``False``, or if the number of quantiles differs from the number
            of value ranges.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "quantile_ranges" not in kwargs:
        raise InvalidExpectationConfigurationError("quantile ranges must be provided")
    quantile_ranges = kwargs["quantile_ranges"]
    if not isinstance(quantile_ranges, dict):
        raise InvalidExpectationConfigurationError(
            "quantile_ranges should be a dictionary"
        )

    # Ensuring actual quantiles and their value ranges match up
    quantiles = quantile_ranges["quantiles"]
    quantile_value_ranges = quantile_ranges["value_ranges"]

    if "allow_relative_error" in kwargs:
        allow_relative_error = kwargs["allow_relative_error"]
    else:
        allow_relative_error = False
    if allow_relative_error is not False:
        raise ValueError(
            "PandasExecutionEngine does not support relative error in column quantiles."
        )

    if len(quantiles) != len(quantile_value_ranges):
        raise ValueError(
            "quantile_values and quantiles must have the same number of elements"
        )
    return True
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
) -> None:
    """Validate the ``min_value`` / ``max_value`` kwargs as integer bounds.

    At least one bound must be provided (not ``None``). Each truthy bound
    must be either an evaluation-parameter dict containing the
    ``"$PARAMETER"`` key or a value whose ``float()`` conversion is a whole
    number. Falsy bounds (``None``, ``0``, an empty dict) are not inspected
    further.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Raises:
        InvalidExpectationConfigurationError: If both bounds are ``None``,
            a bound is not a whole number, or an evaluation-parameter dict
            lacks the ``"$PARAMETER"`` key.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if kwargs.get("min_value") is None and kwargs.get("max_value") is None:
        raise InvalidExpectationConfigurationError(
            "min_value and max_value cannot both be None"
        )

    for bound_name in ("min_value", "max_value"):
        bound = kwargs.get(bound_name)
        if not bound:
            continue

        if isinstance(bound, dict):
            if "$PARAMETER" not in bound:
                raise InvalidExpectationConfigurationError(
                    f"Evaluation Parameter dict for {bound_name} kwarg "
                    'must have "$PARAMETER" key.'
                )
            continue

        # A value that cannot be converted to float is reported as an
        # invalid configuration instead of leaking ValueError/TypeError.
        try:
            is_whole_number = float(bound).is_integer()
        except (TypeError, ValueError):
            is_whole_number = False
        if not is_whole_number:
            raise InvalidExpectationConfigurationError(
                "min_value and max_value must be integers"
            )
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
) -> None:
    """Validate the optional ``strict`` kwarg of the expectation.

    Args:
        configuration (OPTIONAL[ExpectationConfiguration]): \
            Configuration to validate. When ``None``, the expectation's own
            ``self.configuration`` is validated instead.

    Returns:
        None.

    Raises:
        InvalidExpectationConfigurationError: If ``strict`` is present and
            is neither a boolean nor ``None``.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration

    strict = configuration.kwargs.get("strict")

    # Explicit raise rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if strict is not None and not isinstance(strict, bool):
        raise InvalidExpectationConfigurationError("strict must be a boolean value")
def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
    """Validate that the required ``type_`` kwarg is present.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when the configuration is valid.

    Raises:
        InvalidExpectationConfigurationError: If ``type_`` is missing.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration

    # Explicit raise rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "type_" not in configuration.kwargs:
        raise InvalidExpectationConfigurationError("type_ is required")
    return True
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
) -> None:
    """Validate the ``quantile_ranges`` kwarg, including pair ordering.

    Each entry of ``quantile_ranges["value_ranges"]`` must either contain
    ``None`` (an open bound) or already be equal to its sorted form.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Raises:
        InvalidExpectationConfigurationError: If ``quantile_ranges`` is
            missing, is not a dictionary, or holds an unordered pair.
        ValueError: If the number of quantiles differs from the number of
            value ranges.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "quantile_ranges" not in kwargs:
        raise InvalidExpectationConfigurationError("quantile_ranges must be provided")
    quantile_ranges = kwargs["quantile_ranges"]
    if not isinstance(quantile_ranges, dict):
        raise InvalidExpectationConfigurationError(
            "quantile_ranges should be a dictionary"
        )

    quantile_value_ranges = quantile_ranges["value_ranges"]
    pairs_are_ordered = all(
        None in pair or pair == sorted(pair) for pair in quantile_value_ranges
    )
    if not pairs_are_ordered:
        raise InvalidExpectationConfigurationError(
            "quantile_ranges must consist of ordered pairs"
        )

    # Ensuring actual quantiles and their value ranges match up
    quantiles = quantile_ranges["quantiles"]
    if len(quantiles) != len(quantile_value_ranges):
        raise ValueError(
            "quantile_values and quantiles must have the same number of elements"
        )
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
):
    """Validate the required ``column_set`` kwarg.

    ``column_set`` must be present and be a list, a set, ``None``, or an
    evaluation-parameter dict containing the ``"$PARAMETER"`` key.

    Args:
        configuration (OPTIONAL[ExpectationConfiguration]): \
            Configuration to validate. When ``None``, the expectation's own
            ``self.configuration`` is validated instead.

    Returns:
        True if the configuration has been validated successfully.

    Raises:
        InvalidExpectationConfigurationError: If ``column_set`` is missing,
            has an unsupported type, or is a dict without ``"$PARAMETER"``.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "column_set" not in kwargs:
        raise InvalidExpectationConfigurationError("column_set is required")

    column_set = kwargs["column_set"]
    if column_set is not None and not isinstance(column_set, (list, set, dict)):
        raise InvalidExpectationConfigurationError(
            "column_set must be a list, set, or None"
        )
    if isinstance(column_set, dict) and "$PARAMETER" not in column_set:
        raise InvalidExpectationConfigurationError(
            'Evaluation Parameter dict for column_set kwarg must have "$PARAMETER" key.'
        )
    return True
def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
    """Validate the ``other_table_name`` and ``ignore_columns`` kwargs.

    ``other_table_name`` is required and must be a string. When
    ``ignore_columns`` is given it must be a string holding a
    comma-separated list of word-character column names.

    Args:
        configuration (OPTIONAL[ExpectationConfiguration]): \
            Configuration to validate. When ``None``, the expectation's own
            ``self.configuration`` is validated instead.

    Returns:
        True if the configuration has been validated successfully.

    Raises:
        InvalidExpectationConfigurationError: If any of the checks fails.
    """
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "other_table_name" not in kwargs:
        raise InvalidExpectationConfigurationError("other_table_name is required")
    if not isinstance(kwargs["other_table_name"], str):
        raise InvalidExpectationConfigurationError("other_table_name must be a string")

    if "ignore_columns" in kwargs:
        ignore_columns = kwargs["ignore_columns"]
        pattern = re.compile(r"^(\w+)(,\s*\w+)*$")
        # The isinstance guard keeps a non-string value from surfacing as a
        # TypeError out of ``pattern.match``.
        if not isinstance(ignore_columns, str) or not pattern.match(ignore_columns):
            raise InvalidExpectationConfigurationError(
                "ignore_columns input is not valid. "
                "Please provide comma separated columns list"
            )

    # The parent validation runs last, after the kwarg checks above.
    super().validate_configuration(configuration)
    return True
def find_expectation_indexes(
    self,
    expectation_configuration: ExpectationConfiguration,
    match_type: str = "domain",
) -> List[int]:
    """Return the positions of expectations equivalent to a configuration.

    Args:
        expectation_configuration: A potentially incomplete (partial)
            Expectation Configuration compared against every entry of
            ``self.expectations``.
        match_type: Passed through to ``isEquivalentTo``. Options are
            'domain' to match on the data evaluated by the expectation,
            'success' to match on all parameters that influence whether the
            expectation succeeds on a given batch, and 'runtime' to match on
            all configuration parameters.

    Returns:
        A list of indexes of matching ExpectationConfiguration, in suite order.

    Raises:
        InvalidExpectationConfigurationError: If ``expectation_configuration``
            is not an ExpectationConfiguration instance.
    """
    if isinstance(expectation_configuration, ExpectationConfiguration):
        return [
            position
            for position, candidate in enumerate(self.expectations)
            if candidate.isEquivalentTo(expectation_configuration, match_type)
        ]
    raise InvalidExpectationConfigurationError(
        "Ensure that expectation configuration is valid."
    )
def __init__(self, expectation_type, kwargs, meta=None, success_on_last_run=None):
    """Build an expectation configuration from a type name and its kwargs.

    Args:
        expectation_type: Name of the expectation; must be a ``str``.
        kwargs: Dict of expectation arguments, stored as given.
        meta: Optional metadata. A fresh empty dict is used when omitted.
        success_on_last_run: Stored unchanged on the instance.

    Raises:
        InvalidExpectationConfigurationError: If ``expectation_type`` is not
            a string or ``kwargs`` is not a dict.
    """
    type_is_string = isinstance(expectation_type, str)
    if not type_is_string:
        raise InvalidExpectationConfigurationError("expectation_type must be a string")
    self._expectation_type = expectation_type

    kwargs_is_dict = isinstance(kwargs, dict)
    if not kwargs_is_dict:
        raise InvalidExpectationConfigurationError(
            "expectation configuration kwargs must be a dict."
        )
    self._kwargs = kwargs
    # Holds the kwargs as they were before evaluation parameters are evaluated.
    self._raw_kwargs = None

    # Meta must be serializable, but it is stored as given: the check below
    # only inspects the object, conversion is deferred until necessary.
    metadata = {} if meta is None else meta
    ensure_json_serializable(metadata)
    self.meta = metadata
    self.success_on_last_run = success_on_last_run
def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
    """Validate that the required ``column`` kwarg is present.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when the configuration is valid.

    Raises:
        InvalidExpectationConfigurationError: If ``column`` is missing.
    """
    if configuration is None:
        configuration = self.configuration

    # Explicit raise rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "column" not in configuration.kwargs:
        raise InvalidExpectationConfigurationError(
            "'column' parameter is required for column expectations"
        )
    return True
def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
    """Check that a configuration targets this expectation's type.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when ``configuration.expectation_type`` equals
        ``self.expectation_type``.

    Raises:
        InvalidExpectationConfigurationError: If the two types differ.
    """
    if configuration is None:
        configuration = self.configuration

    # Explicit raise rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if configuration.expectation_type != self.expectation_type:
        raise InvalidExpectationConfigurationError(
            "expectation configuration type does not match expectation type"
        )
    return True
def validate_metric_value_between_configuration(
    self, configuration: Optional[ExpectationConfiguration]
):
    """Validate the optional ``min_value`` / ``max_value`` thresholds.

    Each threshold may be absent/``None``, a number (``int`` or ``float``),
    or an evaluation-parameter dict containing the ``"$PARAMETER"`` key.
    When both thresholds are plain numbers, the minimum must not exceed the
    maximum. The ordering check is skipped when either threshold is an
    evaluation-parameter dict, because a dict cannot be ordered against a
    number (the comparison would raise ``TypeError``).

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when the thresholds are valid.

    Raises:
        InvalidExpectationConfigurationError: If a threshold has an
            unsupported type, a dict lacks ``"$PARAMETER"``, or the minimum
            is larger than the maximum.
    """
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    min_val = kwargs["min_value"] if "min_value" in kwargs else None
    max_val = kwargs["max_value"] if "max_value" in kwargs else None

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if min_val is not None and not isinstance(min_val, (float, int, dict)):
        raise InvalidExpectationConfigurationError(
            "Provided min threshold must be a number"
        )
    if isinstance(min_val, dict) and "$PARAMETER" not in min_val:
        raise InvalidExpectationConfigurationError(
            'Evaluation Parameter dict for min_value kwarg must have "$PARAMETER" key'
        )

    if max_val is not None and not isinstance(max_val, (float, int, dict)):
        raise InvalidExpectationConfigurationError(
            "Provided max threshold must be a number"
        )
    if isinstance(max_val, dict) and "$PARAMETER" not in max_val:
        raise InvalidExpectationConfigurationError(
            'Evaluation Parameter dict for max_value kwarg must have "$PARAMETER" key'
        )

    both_given = min_val is not None and max_val is not None
    both_numeric = not isinstance(min_val, dict) and not isinstance(max_val, dict)
    if both_given and both_numeric and min_val > max_val:
        raise InvalidExpectationConfigurationError(
            "Minimum Threshold cannot be larger than Maximum Threshold"
        )
    return True
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
):
    """Validate the required ``type_list`` kwarg.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when the configuration is valid.

    Raises:
        InvalidExpectationConfigurationError: If ``type_list`` is missing or
            is neither a list nor ``None``.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "type_list" not in kwargs:
        raise InvalidExpectationConfigurationError("type_list is required")
    type_list = kwargs["type_list"]
    if type_list is not None and not isinstance(type_list, list):
        raise InvalidExpectationConfigurationError("type_list must be a list or None")
    return True
def ensure_row_condition_is_correct(row_condition_string) -> None:
    """Reject row_condition strings containing single quotes or newlines.

    Either character may cause an issue when the expectation is reloaded,
    so the error is raised when the expectation is declared. Double quotes
    can be used in place of single quotes.

    Parameters
    ----------
    row_condition_string : str
        the pandas query string

    Raises
    ------
    InvalidExpectationConfigurationError
        If the string contains a single quote or a newline character.
    """
    if "'" in row_condition_string:
        # Trailing spaces matter: adjacent literals are joined verbatim, and
        # the sentences previously ran together ("configuration.Use ...").
        raise InvalidExpectationConfigurationError(
            f"{row_condition_string} cannot be serialized to json. "
            "Do not introduce simple quotes in configuration. "
            "Use double quotes instead."
        )
    if "\n" in row_condition_string:
        raise InvalidExpectationConfigurationError(
            f"{repr(row_condition_string)} cannot be serialized to json. Do not introduce \\n in configuration."
        )
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
) -> None:
    """Validate the required ``type_list`` kwarg.

    ``type_list`` must be present and be a list, ``None``, or an
    evaluation-parameter dict containing the ``"$PARAMETER"`` key.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Raises:
        InvalidExpectationConfigurationError: If ``type_list`` is missing,
            has an unsupported type, or is a dict without ``"$PARAMETER"``.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "type_list" not in kwargs:
        raise InvalidExpectationConfigurationError("type_list is required")

    type_list = kwargs["type_list"]
    if type_list is not None and not isinstance(type_list, (list, dict)):
        raise InvalidExpectationConfigurationError("type_list must be a list or None")
    if isinstance(type_list, dict) and "$PARAMETER" not in type_list:
        raise InvalidExpectationConfigurationError(
            'Evaluation Parameter dict for type_list kwarg must have "$PARAMETER" key.'
        )
def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
    """Validate column, threshold and ``quantile_ranges`` kwargs.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        True when every check passes.

    Raises:
        InvalidExpectationConfigurationError: If ``column`` is missing, both
            ``min_value`` and ``max_value`` are ``None``/absent, or
            ``quantile_ranges`` is missing or not a dictionary.
        ValueError: If ``allow_relative_error`` is anything other than
            ``False``, or if the number of quantiles differs from the number
            of value ranges.
    """
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Testing that proper thresholds are in place
    min_val = kwargs["min_value"] if "min_value" in kwargs else None
    max_val = kwargs["max_value"] if "max_value" in kwargs else None

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "column" not in kwargs:
        raise InvalidExpectationConfigurationError(
            "'column' parameter is required for metric"
        )
    if min_val is None and max_val is None:
        raise InvalidExpectationConfigurationError(
            "min_value and max_value cannot both be none"
        )
    if "quantile_ranges" not in kwargs:
        raise InvalidExpectationConfigurationError("quantile ranges must be provided")
    quantile_ranges = kwargs["quantile_ranges"]
    if not isinstance(quantile_ranges, dict):
        raise InvalidExpectationConfigurationError(
            "quantile_ranges should be a dictionary"
        )

    # Ensuring actual quantiles and their value ranges match up
    quantiles = quantile_ranges["quantiles"]
    quantile_value_ranges = quantile_ranges["value_ranges"]

    if "allow_relative_error" in kwargs:
        allow_relative_error = kwargs["allow_relative_error"]
    else:
        allow_relative_error = False
    if allow_relative_error is not False:
        raise ValueError(
            "PandasExecutionEngine does not support relative error in column quantiles."
        )

    if len(quantiles) != len(quantile_value_ranges):
        raise ValueError(
            "quantile_values and quantiles must have the same number of elements"
        )
    return True
def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
    """Validate the ``column`` and optional ``mostly`` kwargs.

    Args:
        configuration: Configuration to validate. When ``None``, the
            expectation's own ``self.configuration`` is validated instead.

    Returns:
        False when the parent validation returns a falsy value, otherwise
        True once the checks below pass.

    Raises:
        InvalidExpectationConfigurationError: If ``column`` is missing, or
            ``mostly`` is present but is not an int/float within [0, 1].
    """
    if not super().validate_configuration(configuration):
        return False
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "column" not in kwargs:
        raise InvalidExpectationConfigurationError(
            "'column' parameter is required for column map expectations"
        )
    if "mostly" in kwargs:
        mostly = kwargs["mostly"]
        if not isinstance(mostly, (int, float)):
            raise InvalidExpectationConfigurationError(
                "'mostly' parameter must be an integer or float"
            )
        if not 0 <= mostly <= 1:
            raise InvalidExpectationConfigurationError(
                "'mostly' parameter must be between 0 and 1"
            )
    return True
def find_expectation_indexes(
    self,
    expectation_configuration: Optional[ExpectationConfiguration] = None,
    match_type: str = "domain",
    ge_cloud_id: Optional[str] = None,
) -> List[int]:
    """
    Find indexes of Expectations matching the given ExpectationConfiguration on the given match_type.
    If a ge_cloud_id is provided, match_type is ignored and only indexes of Expectations
    with matching ge_cloud_id are returned.

    Args:
        expectation_configuration: A potentially incomplete (partial) Expectation Configuration to match against to
            find the index of any matching Expectation Configurations on the suite.
        match_type: This determines what kwargs to use when matching. Options are 'domain' to match based
            on the data evaluated by that expectation, 'success' to match based on all configuration parameters
            that influence whether an expectation succeeds based on a given batch of data, and 'runtime' to match
            based on all configuration parameters
        ge_cloud_id: Great Expectations Cloud id. Ids are compared by their
            ``str()`` form.

    Returns: A list of indexes of matching ExpectationConfiguration

    Raises:
        TypeError: If neither expectation_configuration nor ge_cloud_id is given.
        InvalidExpectationConfigurationError: If expectation_configuration is
            given but is not an ExpectationConfiguration.
    """
    if expectation_configuration is None and ge_cloud_id is None:
        raise TypeError("Must provide either expectation_configuration or ge_cloud_id")

    # Compare against None rather than relying on truthiness, so that falsy
    # objects of the wrong type (e.g. an empty dict) are rejected as well.
    if expectation_configuration is not None and not isinstance(
        expectation_configuration, ExpectationConfiguration
    ):
        raise InvalidExpectationConfigurationError(
            "Ensure that expectation configuration is valid."
        )

    if ge_cloud_id is not None:
        wanted_id = str(ge_cloud_id)
        return [
            idx
            for idx, expectation in enumerate(self.expectations)
            if str(expectation.ge_cloud_id) == wanted_id
        ]

    return [
        idx
        for idx, expectation in enumerate(self.expectations)
        if expectation.isEquivalentTo(
            other=expectation_configuration, match_type=match_type
        )
    ]
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
) -> None:
    """Validate the feature-importance kwargs of the expectation.

    Checks, in order: at least one of ``important_columns`` / ``threshold``
    is given; ``n_features`` is an int or ``None``; ``important_columns``
    (when given) is a tuple or list of strings; ``threshold`` is ``None`` or
    a float within [0, 1]; ``y_column`` is a non-``None`` string.

    Args:
        configuration (OPTIONAL[ExpectationConfiguration]): \
            Configuration to validate. When ``None``, the expectation's own
            ``self.configuration`` is validated instead.

    Returns:
        None.

    Raises:
        InvalidExpectationConfigurationError: If any of the checks fails.
    """
    # The parent validation is invoked once here; the original repeated the
    # same call at the end of the method.
    super().validate_configuration(configuration)
    if configuration is None:
        configuration = self.configuration

    kwargs = configuration.kwargs
    n_features = kwargs.get("n_features")
    columns = kwargs.get("important_columns")
    threshold = kwargs.get("threshold")
    y_column = kwargs.get("y_column")

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if columns is None and threshold is None:
        raise InvalidExpectationConfigurationError(
            "at least one of important_columns or threshold is required"
        )
    if n_features is not None and not isinstance(n_features, int):
        raise InvalidExpectationConfigurationError("n_features must be an integer")
    if columns is not None:
        columns_ok = isinstance(columns, (tuple, list)) and all(
            isinstance(i, str) for i in columns
        )
        if not columns_ok:
            raise InvalidExpectationConfigurationError(
                "columns must be a tuple or list of string column names"
            )
    if threshold is not None and not (
        isinstance(threshold, float) and 0 <= threshold <= 1
    ):
        raise InvalidExpectationConfigurationError(
            "threshold must be a float between 0 and 1"
        )
    if y_column is None:
        raise InvalidExpectationConfigurationError("target y_column must be specified")
    if not isinstance(y_column, str):
        raise InvalidExpectationConfigurationError(
            "y_column must be a string column name"
        )
def validate_configuration(
    self, configuration: Optional[ExpectationConfiguration]
):
    """Validate the required ``user_input`` kwarg.

    Args:
        configuration (OPTIONAL[ExpectationConfiguration]): \
            Configuration to validate. When ``None``, the expectation's own
            ``self.configuration`` is validated instead.

    Returns:
        True if the configuration has been validated successfully.

    Raises:
        InvalidExpectationConfigurationError: If ``user_input`` is missing
            or is not a string.
    """
    if configuration is None:
        configuration = self.configuration
    kwargs = configuration.kwargs

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if "user_input" not in kwargs:
        raise InvalidExpectationConfigurationError("user_input is required")
    if not isinstance(kwargs["user_input"], str):
        raise InvalidExpectationConfigurationError("user_input must be a string")

    # The parent validation runs last, after the kwarg checks above.
    super().validate_configuration(configuration)
    return True
def ensure_json_serializable(data):
    """
    Check, recursively, that an object can be serialized to JSON.

    Nothing is converted and nothing is returned: the function either
    completes silently or raises for the first unsupported value it finds.

    Args:
        data: the object to check. Containers (dict, list, tuple, set,
            numpy arrays, pandas Index/Series/DataFrame) are walked
            element by element.

    Returns:
        None

    Raises:
        InvalidExpectationConfigurationError: if ``data`` (or a nested
            element) is of a type that is not handled below.
    """
    import numpy as np
    import pandas as pd
    import datetime
    import decimal

    # If it's one of our types, we use our own conversion; this can move to full schema
    # once nesting goes all the way down
    if isinstance(
        data,
        (
            ExpectationConfiguration,
            ExpectationSuite,
            ExpectationValidationResult,
            ExpectationSuiteValidationResult,
        ),
    ):
        return

    try:
        # np.isnan is functionally vectorized, but we only want to apply this
        # to single objects, hence the `not isinstance(data, list)` test.
        if not isinstance(data, list) and np.isnan(data):
            return
    except (TypeError, ValueError):
        # Not a scalar np.isnan can judge; fall through to the type checks.
        pass

    if isinstance(data, (str, int, float, bool)):
        # No problem to encode json
        return

    elif isinstance(data, dict):
        for key in data:
            str(key)  # key must be cast-able to string
            ensure_json_serializable(data[key])
        return

    elif isinstance(data, (list, tuple, set)):
        for val in data:
            ensure_json_serializable(val)
        return

    elif isinstance(data, (np.ndarray, pd.Index)):
        # Check the plain-Python elements produced by tolist().
        for element in data.tolist():
            ensure_json_serializable(element)
        return

    # Note: This clause has to come after checking for np.ndarray or we get:
    #      `ValueError: The truth value of an array with more than one element is ambiguous.
    #      Use a.any() or a.all()`
    elif data is None:
        # No problem to encode json
        return

    elif isinstance(data, (datetime.datetime, datetime.date)):
        return

    # Use built in base type from numpy, https://docs.scipy.org/doc/numpy-1.13.0/user/basics.types.html
    # https://github.com/numpy/numpy/pull/9505
    elif np.issubdtype(type(data), np.bool_):
        return

    elif np.issubdtype(type(data), np.integer) or np.issubdtype(type(data), np.uint):
        return

    elif np.issubdtype(type(data), np.floating):
        # Note: Use np.floating to avoid FutureWarning from numpy
        return

    elif isinstance(data, pd.Series):
        # Both the index labels and the values must be serializable.
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0.
        for idx, val in data.items():
            ensure_json_serializable(idx)
            ensure_json_serializable(val)
        return

    elif isinstance(data, pd.DataFrame):
        return ensure_json_serializable(data.to_dict(orient="records"))

    elif isinstance(data, decimal.Decimal):
        return

    else:
        raise InvalidExpectationConfigurationError(
            "%s is of type %s which cannot be serialized to json"
            % (str(data), type(data).__name__)
        )
def ensure_json_serializable(data):
    """
    Check, recursively, that an object can be serialized to JSON.

    Nothing is converted and nothing is returned: the function either
    completes silently or raises for the first unsupported value it finds.

    Args:
        data: the object to check. Containers (dict, list, tuple, set,
            numpy arrays, pandas Index/Series/DataFrame, pyspark DataFrame)
            are walked element by element.

    Returns:
        None

    Raises:
        InvalidExpectationConfigurationError: if ``data`` (or a nested
            element) is of a type that is not handled below.
    """
    if isinstance(data, SerializableDictDot):
        return

    try:
        # pd.isna is functionally vectorized, but we only want to apply this
        # to single objects, hence the `not isinstance(data, list)` test.
        if not isinstance(data, list) and pd.isna(data):
            return
    except (TypeError, ValueError):
        # Not a scalar pd.isna can judge; fall through to the type checks.
        pass

    if isinstance(data, (str, int, float, bool)):
        # No problem to encode json
        return

    elif isinstance(data, dict):
        for key in data:
            str(key)  # key must be cast-able to string
            ensure_json_serializable(data[key])
        return

    elif isinstance(data, (list, tuple, set)):
        for val in data:
            ensure_json_serializable(val)
        return

    elif isinstance(data, (np.ndarray, pd.Index)):
        # Check the plain-Python elements produced by tolist().
        for element in data.tolist():
            ensure_json_serializable(element)
        return

    # Note: This clause has to come after checking for np.ndarray or we get:
    #      `ValueError: The truth value of an array with more than one element is ambiguous.
    #      Use a.any() or a.all()`
    elif data is None:
        # No problem to encode json
        return

    elif isinstance(data, (datetime.datetime, datetime.date)):
        return

    # Use built in base type from numpy, https://docs.scipy.org/doc/numpy-1.13.0/user/basics.types.html
    # https://github.com/numpy/numpy/pull/9505
    elif np.issubdtype(type(data), np.bool_):
        return

    elif np.issubdtype(type(data), np.integer) or np.issubdtype(type(data), np.uint):
        return

    elif np.issubdtype(type(data), np.floating):
        # Note: Use np.floating to avoid FutureWarning from numpy
        return

    elif isinstance(data, pd.Series):
        # Both the index labels and the values must be serializable.
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0.
        for idx, val in data.items():
            ensure_json_serializable(idx)
            ensure_json_serializable(val)
        return

    elif pyspark and isinstance(data, pyspark.sql.DataFrame):
        # using StackOverflow suggestion for converting pyspark df into dictionary
        # https://stackoverflow.com/questions/43679880/pyspark-dataframe-to-dictionary-columns-as-keys-and-list-of-column-values-ad-di
        return ensure_json_serializable(
            dict(zip(data.schema.names, zip(*data.collect())))
        )

    elif isinstance(data, pd.DataFrame):
        return ensure_json_serializable(data.to_dict(orient="records"))

    elif isinstance(data, decimal.Decimal):
        return

    elif isinstance(data, RunIdentifier):
        return

    else:
        raise InvalidExpectationConfigurationError(
            "%s is of type %s which cannot be serialized to json"
            % (str(data), type(data).__name__)
        )
def configuration(self):
    """Return the stored expectation configuration.

    Raises:
        InvalidExpectationConfigurationError: If ``self._configuration`` is
            ``None``, i.e. the expectation has not been configured yet.
    """
    current = self._configuration
    if current is not None:
        return current
    raise InvalidExpectationConfigurationError(
        "cannot access configuration: expectation has not yet been configured"
    )