def _spark(
    cls,
    execution_engine: "SparkDFExecutionEngine",
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    ignore_row_if = metric_value_kwargs["ignore_row_if"]
    compute_domain_kwargs = copy.deepcopy(metric_domain_kwargs)

    if ignore_row_if == "both_values_are_missing":
        # Ignore only rows where both columns are null, i.e. keep rows where at least one is populated.
        compute_domain_kwargs["row_condition"] = (
            F.col(metric_domain_kwargs["column_A"]).isNotNull()
            | F.col(metric_domain_kwargs["column_B"]).isNotNull()
        )
        compute_domain_kwargs["condition_parser"] = "spark"
    elif ignore_row_if == "either_value_is_missing":
        # Ignore rows where either column is null, i.e. keep only rows where both are populated.
        compute_domain_kwargs["row_condition"] = (
            F.col(metric_domain_kwargs["column_A"]).isNotNull()
            & F.col(metric_domain_kwargs["column_B"]).isNotNull()
        )
        compute_domain_kwargs["condition_parser"] = "spark"

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, MetricDomainTypes.COLUMN_PAIR
    )

    return (
        df[metric_domain_kwargs["column_A"]] == df[metric_domain_kwargs["column_B"]],
        compute_domain_kwargs,
        accessor_domain_kwargs,
    )
def _spark(cls, column_A, column_B, **kwargs):
    allow_cross_type_comparisons: bool = (
        kwargs.get("allow_cross_type_comparisons") or False
    )
    if allow_cross_type_comparisons:
        raise NotImplementedError

    parse_strings_as_datetimes: bool = (
        kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        warnings.warn(
            """The parameter "parse_strings_as_datetimes" is deprecated and will be removed in a \
future release. Please update code accordingly.
""",
            DeprecationWarning,
        )

        temp_column_A = F.to_date(column_A)
        temp_column_B = F.to_date(column_B)
    else:
        temp_column_A = column_A
        temp_column_B = column_B

    or_equal: bool = kwargs.get("or_equal") or False
    if or_equal:
        # eqNullSafe treats a pair of nulls as equal, so such rows also pass when or_equal is set.
        return (temp_column_A >= temp_column_B) | (
            temp_column_A.eqNullSafe(temp_column_B)
        )
    else:
        return temp_column_A > temp_column_B
def _spark(cls, column_A, column_B, **kwargs):
    allow_cross_type_comparisons: bool = (
        kwargs.get("allow_cross_type_comparisons") or False
    )
    if allow_cross_type_comparisons:
        raise NotImplementedError

    parse_strings_as_datetimes: bool = (
        kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        # deprecated-v0.13.41
        warnings.warn(
            """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 and will be removed in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
            DeprecationWarning,
        )

        temp_column_A = F.to_date(column_A)
        temp_column_B = F.to_date(column_B)
    else:
        temp_column_A = column_A
        temp_column_B = column_B

    or_equal: bool = kwargs.get("or_equal") or False
    if or_equal:
        return (temp_column_A >= temp_column_B) | (
            temp_column_A.eqNullSafe(temp_column_B)
        )
    else:
        return temp_column_A > temp_column_B
def _spark(cls, column_list, **kwargs):
    sum_total = kwargs.get("sum_total")
    expression = "+".join(
        f"COALESCE({column_name}, 0)" for column_name in column_list.columns
    )
    row_wise_cond = F.expr(expression) == F.lit(sum_total)
    return row_wise_cond
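# Illustrative sketch only (toy column names "a", "b", "c" and a total of 10 are assumptions,
# not taken from the metric above): this shows the kind of SQL expression the join above builds
# and how F.expr turns it into a boolean row-wise condition. The "F" alias matches the
# pyspark.sql.functions alias assumed throughout these metrics.
from pyspark.sql import functions as F

example_expression = "+".join(f"COALESCE({name}, 0)" for name in ["a", "b", "c"])
# example_expression == "COALESCE(a, 0)+COALESCE(b, 0)+COALESCE(c, 0)"
example_cond = F.expr(example_expression) == F.lit(10)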
def _spark(
    cls,
    execution_engine: "SparkDFExecutionEngine",
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    return F.count(F.lit(1)), metric_domain_kwargs, dict()
def _spark(cls, column_list, **kwargs):
    column_names = column_list.columns
    num_columns = len(column_names)

    # Build a null-safe equality check for every pair of columns; a row passes
    # only if no two of its columns hold the same value.
    conditions = []
    for idx_src in range(num_columns - 1):
        for idx_dest in range(idx_src + 1, num_columns):
            conditions.append(
                F.col(column_names[idx_src]).eqNullSafe(F.col(column_names[idx_dest]))
            )

    row_wise_cond = ~reduce(lambda a, b: a | b, conditions)
    return row_wise_cond
def _spark(cls, column_A, column_B, **kwargs):
    value_pairs_set = kwargs.get("value_pairs_set")

    if value_pairs_set is None:
        # vacuously true
        return column_A == column_B

    value_pairs_set = [(x, y) for x, y in value_pairs_set]

    conditions = [
        (column_A.eqNullSafe(F.lit(x)) & column_B.eqNullSafe(F.lit(y)))
        for x, y in value_pairs_set
    ]
    row_wise_cond = reduce(lambda a, b: a | b, conditions)

    return row_wise_cond
def _spark(cls, column, **kwargs):
    def is_ascii(val):
        return str(val).isascii()

    is_ascii_udf = F.udf(is_ascii, sparktypes.BooleanType())

    return is_ascii_udf(column)
def _spark(cls, column, strftime_format, **kwargs):
    # Below is a simple validation that the provided format can both format and parse a datetime object.
    # "%D", for example, is a format that can format but not parse.
    try:
        datetime.strptime(
            datetime.strftime(datetime.now(), strftime_format), strftime_format
        )
    except ValueError as e:
        raise ValueError(f"Unable to use provided strftime_format: {str(e)}")

    def is_parseable_by_format(val):
        if val is None:
            return False
        try:
            datetime.strptime(val, strftime_format)
            return True
        except TypeError:
            raise TypeError(
                "Values passed to expect_column_values_to_match_strftime_format must be of type string.\nIf you want to validate a column of dates or timestamps, please call the expectation before converting from string format."
            )
        except ValueError:
            return False

    success_udf = F.udf(is_parseable_by_format, sparktypes.BooleanType())
    return success_udf(column)
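# Quick illustration of why the round-trip check above is needed (not part of the metric;
# assumes a platform whose C strftime supports "%D"): "%D" can be used to format a datetime,
# but datetime.strptime rejects it as a directive.
from datetime import datetime

formatted = datetime.now().strftime("%D")  # e.g. "01/02/24" where "%D" is supported
# datetime.strptime(formatted, "%D")       # would raise ValueError: bad directive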
def _spark(
    cls,
    column,
    value_set,
    **kwargs,
):
    # no need to parse as datetime; just compare the strings as is
    parse_strings_as_datetimes: bool = (
        kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        # deprecated-v0.13.41
        warnings.warn(
            """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 and will be removed in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
            DeprecationWarning,
        )

    if value_set is None:
        # vacuously true
        return F.lit(True)

    return column.isin(value_set)
def _spark_condition(cls, column, _metrics, threshold, double_sided, **kwargs):
    z_score, _, _ = _metrics["column_values.z_score.map"]
    if double_sided:
        threshold = abs(threshold)
        z_score = F.abs(z_score)

    return z_score < threshold
def _spark(
    cls,
    execution_engine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]
    if isinstance(column_metadata["type"], sparktypes.StringType):
        column = F.col(column_name).cast(sparktypes.IntegerType())
    else:
        raise TypeError("Column must be a string-type capable of being cast to int.")

    compute_domain_kwargs = metric_domain_kwargs

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )

    if any(np.array(df.select(column.isNull()).collect())):
        raise TypeError("Column must be a string-type capable of being cast to int.")

    # Compare each value with the previous row using a lag over a window ordered by a constant literal.
    diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    # The first row has no predecessor; treat it as satisfying the condition.
    diff = F.when(diff.isNull(), 1).otherwise(diff)

    if metric_value_kwargs["strictly"] is True:
        diff = F.when(diff <= 0, F.lit(False)).otherwise(F.lit(True))
    else:
        diff = F.when(diff < 0, F.lit(False)).otherwise(F.lit(True))

    return (
        np.array(df.select(diff).collect()).reshape(-1)[1:],
        compute_domain_kwargs,
        accessor_domain_kwargs,
    )
def _spark(cls, column, **kwargs):
    def is_xml(val):
        try:
            etree.fromstring(val)
            return True
        except Exception:
            # Any parsing failure (including None values) means the value is not valid XML.
            return False

    is_xml_udf = F.udf(is_xml, sparktypes.BooleanType())

    return is_xml_udf(column)
def _spark(cls, column, json_schema, **kwargs):
    def is_json(val):
        try:
            json.loads(val)
            return True
        except Exception:
            # json.loads raises (JSONDecodeError, or TypeError for None) on anything that is not valid JSON.
            return False

    is_json_udf = F.udf(is_json, sparktypes.BooleanType())

    return is_json_udf(column)
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column_name = accessor_domain_kwargs["column"]
    column = F.col(column_name)
    query = F.when(column == 3, F.lit(False)).otherwise(F.lit(True))
    return (query, compute_domain_kwargs, accessor_domain_kwargs)
def _spark(cls, column, **kwargs):
    center_point = kwargs.get("center_point")
    unit = kwargs.get("unit")
    range = kwargs.get("range")
    projection = kwargs.get("projection")

    if projection == "fcc":
        if unit == "kilometers":
            distances = F.udf(
                lambda x, y=center_point: fcc_projection(x, y),
                sparktypes.FloatType(),
            )
        elif unit == "miles":
            distances = F.udf(
                lambda x, y=center_point: fcc_projection(x, y) * 1.609344,
                sparktypes.FloatType(),
            )
            range = range * 1.609344
        return F.when(distances(column) < range, F.lit(True)).otherwise(F.lit(False))
    elif projection == "pythagorean":
        if unit == "kilometers":
            distances = F.udf(
                lambda x, y=center_point: pythagorean_projection(x, y),
                sparktypes.FloatType(),
            )
        elif unit == "miles":
            distances = F.udf(
                lambda x, y=center_point: pythagorean_projection(x, y) * 1.609344,
                sparktypes.FloatType(),
            )
            range = range * 1.609344
        return F.when(distances(column) < range, F.lit(True)).otherwise(F.lit(False))
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    sort = metric_value_kwargs.get("sort", cls.default_kwarg_values["sort"])
    collate = metric_value_kwargs.get("collate", cls.default_kwarg_values["collate"])

    if sort not in ["value", "count", "none"]:
        raise ValueError("sort must be either 'value', 'count', or 'none'")
    if collate is not None:
        raise ValueError("collate parameter is not supported in SparkDFDataset")

    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column = accessor_domain_kwargs["column"]

    value_counts = (
        df.select(column).where(F.col(column).isNotNull()).groupBy(column).count()
    )
    if sort == "value":
        value_counts = value_counts.orderBy(column)
    elif sort == "count":
        value_counts = value_counts.orderBy(F.desc("count"))
    value_counts = value_counts.collect()

    series = pd.Series(
        [row["count"] for row in value_counts],
        index=pd.Index(data=[row[column] for row in value_counts], name="value"),
        name="count",
    )

    return series
def _spark(cls, column, **kwargs):
    parse_strings_as_datetimes: bool = (
        kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        warnings.warn(
            """The parameter "parse_strings_as_datetimes" is deprecated and will be removed in a \
future release. Please update code accordingly.
""",
            DeprecationWarning,
        )

        try:
            column = apply_dateutil_parse(column=column)
        except TypeError:
            pass

    return F.max(column)
def _spark(cls, column, **kwargs):
    parse_strings_as_datetimes: bool = (
        kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        # deprecated-v0.13.41
        warnings.warn(
            """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 and will be removed in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
            DeprecationWarning,
        )

        try:
            column = apply_dateutil_parse(column=column)
        except TypeError:
            pass

    return F.min(column)
def _spark(cls, column, json_schema, **kwargs):
    def matches_json_schema(val):
        if val is None:
            return False
        try:
            val_json = json.loads(val)
            jsonschema.validate(val_json, json_schema)
            # jsonschema.validate raises an error if validation fails.
            # So if we make it this far, we know that the validation succeeded.
            return True
        except jsonschema.ValidationError:
            return False
        except jsonschema.SchemaError:
            raise
        except:
            raise

    matches_json_schema_udf = F.udf(matches_json_schema, sparktypes.BooleanType())

    return matches_json_schema_udf(column)
def _spark(cls, column, xml_schema, **kwargs):
    try:
        xmlschema_doc = etree.fromstring(xml_schema)
        xmlschema = etree.XMLSchema(xmlschema_doc)
    except etree.ParseError:
        raise
    except:
        raise

    def matches_xml_schema(val):
        if val is None:
            return False
        try:
            xml_doc = etree.fromstring(val)
            return xmlschema(xml_doc)
        except:
            raise

    matches_xml_schema_udf = F.udf(matches_xml_schema, sparktypes.BooleanType())

    return matches_xml_schema_udf(column)
def _spark(
    cls,
    column,
    value_set,
    **kwargs,
):
    # no need to parse as datetime; just compare the strings as is
    parse_strings_as_datetimes: bool = (
        kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        warnings.warn(
            f"""The parameter "parse_strings_as_datetimes" is deprecated and will be removed in a \
future release. Please update code accordingly. Moreover, in "{cls.__name__}._spark()", it is not used.
""",
            DeprecationWarning,
        )

    if value_set is None:
        # vacuously true
        return F.lit(True)

    return column.isin(value_set)
def _spark(
    cls,
    execution_engine: "SparkDFExecutionEngine",
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    ignore_row_if = metric_value_kwargs["ignore_row_if"]
    compute_domain_kwargs = copy.deepcopy(metric_domain_kwargs)

    if ignore_row_if == "both_values_are_missing":
        # Ignore only rows where both columns are null, i.e. keep rows where at least one is populated.
        compute_domain_kwargs["row_condition"] = (
            F.col(metric_domain_kwargs["column_A"]).isNotNull()
            | F.col(metric_domain_kwargs["column_B"]).isNotNull()
        )
        compute_domain_kwargs["condition_parser"] = "spark"
    elif ignore_row_if == "either_value_is_missing":
        # Ignore rows where either column is null, i.e. keep only rows where both are populated.
        compute_domain_kwargs["row_condition"] = (
            F.col(metric_domain_kwargs["column_A"]).isNotNull()
            & F.col(metric_domain_kwargs["column_B"]).isNotNull()
        )
        compute_domain_kwargs["condition_parser"] = "spark"

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN_PAIR
    )

    # Pair up column_A and column_B so each row can be matched against the value pairs.
    df = df.withColumn(
        "combined",
        F.array(
            F.col(metric_domain_kwargs["column_A"]),
            F.col(metric_domain_kwargs["column_B"]),
        ),
    )

    value_set_df = (
        SQLContext(df._sc)
        .createDataFrame(metric_value_kwargs["value_pairs_set"], ["col_A", "col_B"])
        .select(F.array("col_A", "col_B").alias("set_AB"))
    )

    df = df.join(
        value_set_df, df["combined"] == value_set_df["set_AB"], "left"
    ).withColumn(
        "__success",
        F.when(F.col("set_AB").isNull(), F.lit(False)).otherwise(F.lit(True)),
    )

    return df["__success"], compute_domain_kwargs, accessor_domain_kwargs
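# Hedged alternative sketch (not part of the metric above): SQLContext is deprecated in
# newer PySpark releases, so the lookup DataFrame could instead be built from an explicit
# SparkSession. "example_value_pairs" stands in for metric_value_kwargs["value_pairs_set"].
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
example_value_pairs = [("open", "closed"), ("on", "off")]  # assumed example pairs
value_set_df = spark.createDataFrame(example_value_pairs, ["col_A", "col_B"]).select(
    F.array("col_A", "col_B").alias("set_AB")
)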
def _spark_function(cls, column, **kwargs):
    return F.length(column)
def _spark(cls, column, **kwargs: dict):  # type: ignore[no-untyped-def]
    return F.max(F.length(column))
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    parse_strings_as_datetimes: bool = (
        metric_value_kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        warnings.warn(
            f"""The parameter "parse_strings_as_datetimes" is deprecated and will be removed in a \
future release. Please update code accordingly. Moreover, in "{cls.__name__}._spark()", types are detected naturally.
""",
            DeprecationWarning,
        )

    # check if column is any type that could have na (numeric types)
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]
    if isinstance(
        column_metadata["type"],
        (
            sparktypes.LongType,
            sparktypes.DoubleType,
            sparktypes.IntegerType,
        ),
    ):
        # if column is any type that could have NA values, remove them (not filtered by .isNotNull())
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs,
            filter_null=cls.filter_column_isnull,
            filter_nan=True,
        )
    else:
        compute_domain_kwargs = metric_domain_kwargs

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, MetricDomainTypes.COLUMN
    )

    # NOTE: 20201105 - parse_strings_as_datetimes is not supported here;
    # instead detect types naturally
    column = F.col(column_name)
    if isinstance(
        column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)
    ):
        diff = F.datediff(
            column, F.lag(column).over(Window.orderBy(F.lit("constant")))
        )
    else:
        diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    diff = F.when(diff.isNull(), -1).otherwise(diff)

    # NOTE: because in spark we are implementing the window function directly,
    # we have to return the *unexpected* condition
    if metric_value_kwargs["strictly"]:
        return (
            F.when(diff >= 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
    # If we expect values to be flat or decreasing then unexpected values are those
    # that are increasing
    else:
        return (
            F.when(diff > 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
def _spark(cls, column, **kwargs):
    tz_udf = F.udf(is_valid_timezone, sparktypes.BooleanType())

    return tz_udf(column)
def _spark(cls, column, **kwargs):
    # A value is unique when the window partitioned by that value contains exactly one row.
    return F.count(F.lit(1)).over(Window.partitionBy(column)) <= 1
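# Standalone sketch (illustrative only, toy data is an assumption): the same window-based
# uniqueness condition evaluated on a small DataFrame.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
toy_df = spark.createDataFrame([("a",), ("b",), ("b",)], ["col"])
unique_cond = F.count(F.lit(1)).over(Window.partitionBy(F.col("col"))) <= 1
toy_df.withColumn("is_unique", unique_cond).show()  # "a" -> true, both "b" rows -> false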
def _spark(cls, column, **kwargs):
    """Spark Standard Deviation implementation"""
    return F.stddev_samp(column)
def _spark(cls, column, _table, _column_name, **kwargs):
    """Spark Mean Implementation"""
    types = dict(_table.dtypes)
    if types[_column_name] not in ("int", "float", "double", "bigint"):
        raise TypeError("Expected numeric column type for function mean()")
    return F.mean(column)