Example #1
    def _spark(
        cls,
        execution_engine: "SparkDFExecutionEngine",
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        ignore_row_if = metric_value_kwargs["ignore_row_if"]
        compute_domain_kwargs = copy.deepcopy(metric_domain_kwargs)

        if ignore_row_if == "both_values_are_missing":
            # Drop a row only when both values are missing, i.e. keep rows
            # where at least one of the two columns is non-null.
            compute_domain_kwargs["row_condition"] = (
                F.col(metric_domain_kwargs["column_A"]).isNotNull()
                | F.col(metric_domain_kwargs["column_B"]).isNotNull())
            compute_domain_kwargs["condition_parser"] = "spark"
        elif ignore_row_if == "either_value_is_missing":
            # Drop a row when either value is missing, i.e. keep only rows
            # where both columns are non-null.
            compute_domain_kwargs["row_condition"] = (
                F.col(metric_domain_kwargs["column_A"]).isNotNull()
                & F.col(metric_domain_kwargs["column_B"]).isNotNull())
            compute_domain_kwargs["condition_parser"] = "spark"

        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(compute_domain_kwargs,
                                                MetricDomainTypes.COLUMN_PAIR)

        return (
            df[metric_domain_kwargs["column_A"]] == df[
                metric_domain_kwargs["column_B"]],
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
Example #2
    def _spark(cls, column_A, column_B, **kwargs):
        allow_cross_type_comparisons: bool = (
            kwargs.get("allow_cross_type_comparisons") or False
        )
        if allow_cross_type_comparisons:
            raise NotImplementedError

        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False
        )
        if parse_strings_as_datetimes:
            warnings.warn(
                """The parameter "parse_strings_as_datetimes" is no longer supported and will be deprecated in a \
future release.  Please update code accordingly.
""",
                DeprecationWarning,
            )

            temp_column_A = F.to_date(column_A)
            temp_column_B = F.to_date(column_B)
        else:
            temp_column_A = column_A
            temp_column_B = column_B

        or_equal: bool = kwargs.get("or_equal") or False
        if or_equal:
            return (temp_column_A >= temp_column_B) | (
                temp_column_A.eqNullSafe(temp_column_B)
            )
        else:
            return temp_column_A > temp_column_B
Example #3
    def _spark(cls, column_A, column_B, **kwargs):
        allow_cross_type_comparisons: bool = (
            kwargs.get("allow_cross_type_comparisons") or False
        )
        if allow_cross_type_comparisons:
            raise NotImplementedError

        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False
        )
        if parse_strings_as_datetimes:
            # deprecated-v0.13.41
            warnings.warn(
                """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
                DeprecationWarning,
            )

            temp_column_A = F.to_date(column_A)
            temp_column_B = F.to_date(column_B)
        else:
            temp_column_A = column_A
            temp_column_B = column_B

        or_equal: bool = kwargs.get("or_equal") or False
        if or_equal:
            return (temp_column_A >= temp_column_B) | (
                temp_column_A.eqNullSafe(temp_column_B)
            )
        else:
            return temp_column_A > temp_column_B
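A minimal standalone sketch (toy data and a local session are assumptions, not part of the source) of why the "or_equal" branch above uses eqNullSafe rather than ==: null-safe equality returns a boolean even when both sides are NULL, whereas == returns NULL.

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1, 1), (2, None), (None, None)], ["a", "b"])

    # "==" yields NULL when either side is NULL; eqNullSafe yields True/False.
    df.select(
        (F.col("a") == F.col("b")).alias("eq"),
        F.col("a").eqNullSafe(F.col("b")).alias("eq_null_safe"),
    ).show()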
Example #4
    def _spark(cls, column_list, **kwargs):
        sum_total = kwargs.get("sum_total")
        expression = "+".join([
            f"COALESCE({column_name}, 0)"
            for column_name in column_list.columns
        ])
        row_wise_cond = F.expr(expression) == F.lit(sum_total)
        return row_wise_cond
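A small usage sketch (toy DataFrame and expected total are assumptions) showing what the COALESCE expression built above evaluates to, with NULLs counted as 0:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1, 2, 3), (1, None, 3)], ["a", "b", "c"])

    # Same construction as the metric above: NULLs coalesce to 0 before the
    # row-wise sum is compared against the expected total (6 here).
    expression = "+".join(f"COALESCE({name}, 0)" for name in df.columns)
    df.withColumn("__success", F.expr(expression) == F.lit(6)).show()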
Example #5
    def _spark(
        cls,
        execution_engine: "SparkDFExecutionEngine",
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        return F.count(F.lit(1)), metric_domain_kwargs, dict()
Example #6
    def _spark(cls, column_list, **kwargs):
        column_names = column_list.columns
        num_columns = len(column_names)

        conditions = []
        for idx_src in range(num_columns - 1):
            for idx_dest in range(idx_src + 1, num_columns):
                conditions.append(
                    F.col(column_names[idx_src]).eqNullSafe(
                        F.col(column_names[idx_dest])))

        # OR all pairwise matches together, then negate: a row passes only
        # when no two of its columns hold equal values.
        row_wise_cond = ~reduce(lambda a, b: a | b, conditions)
        return row_wise_cond
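For context, reduce above is functools.reduce. A self-contained sketch (toy data assumed, not from the source) of the same pairwise trick:

    from functools import reduce
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1, 2, 3), (1, 1, 3)], ["a", "b", "c"])

    # One null-safe equality per column pair, OR-ed together and negated:
    # a row passes only when no two of its values match.
    names = df.columns
    conditions = [
        F.col(names[i]).eqNullSafe(F.col(names[j]))
        for i in range(len(names) - 1)
        for j in range(i + 1, len(names))
    ]
    df.withColumn("__success", ~reduce(lambda a, b: a | b, conditions)).show()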
Example #7
    def _spark(cls, column_A, column_B, **kwargs):
        value_pairs_set = kwargs.get("value_pairs_set")

        if value_pairs_set is None:
            # vacuously true
            return F.lit(True)

        value_pairs_set = [(x, y) for x, y in value_pairs_set]
        conditions = [
            (column_A.eqNullSafe(F.lit(x)) & column_B.eqNullSafe(F.lit(y)))
            for x, y in value_pairs_set
        ]
        row_wise_cond = reduce(lambda a, b: a | b, conditions)

        return row_wise_cond
Example #8
    def _spark(cls, column, **kwargs):
        def is_ascii(val):
            return str(val).isascii()

        is_ascii_udf = F.udf(is_ascii, sparktypes.BooleanType())

        return is_ascii_udf(column)
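A driver-side sketch (assumed local session and toy values, not from the source) of applying a UDF-based condition like the one above; note that str.isascii requires Python 3.7+.

    from pyspark.sql import SparkSession, functions as F, types as sparktypes

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([("abc",), ("héllo",)], ["col"])

    # Same shape as the metric: wrap a plain Python predicate in a
    # BooleanType UDF and apply it to the column.
    is_ascii_udf = F.udf(lambda val: str(val).isascii(), sparktypes.BooleanType())
    df.withColumn("__success", is_ascii_udf(F.col("col"))).show()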
Example #9
    def _spark(cls, column, strftime_format, **kwargs):
        # Below is a simple validation that the provided format can both format and parse a datetime object.
        # "%D", for example, is a format that can format but not parse.
        try:
            datetime.strptime(
                datetime.strftime(datetime.now(), strftime_format),
                strftime_format)
        except ValueError as e:
            raise ValueError(
                f"Unable to use provided strftime_format: {str(e)}")

        def is_parseable_by_format(val):
            if val is None:
                return False
            try:
                datetime.strptime(val, strftime_format)
                return True
            except TypeError:
                raise TypeError(
                    "Values passed to expect_column_values_to_match_strftime_format must be of type string.\nIf you want to validate a column of dates or timestamps, please call the expectation before converting from string format."
                )
            except ValueError:
                return False

        success_udf = F.udf(is_parseable_by_format, sparktypes.BooleanType())
        return success_udf(column)
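The round-trip format check above runs on the driver before any Spark work; a short standalone sketch (illustrative formats only):

    from datetime import datetime

    # "%Y-%m-%d" both formats and parses, so the round trip succeeds.
    fmt = "%Y-%m-%d"
    datetime.strptime(datetime.strftime(datetime.now(), fmt), fmt)

    # "%D" formats on most platforms but strptime rejects it, so the metric
    # would raise ValueError up front.
    try:
        datetime.strptime(datetime.strftime(datetime.now(), "%D"), "%D")
    except ValueError as e:
        print(f"Unable to use provided strftime_format: {e}")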
Example #10
    def _spark(
        cls,
        column,
        value_set,
        **kwargs,
    ):
        # no need to parse as datetime; just compare the strings as is
        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False
        )
        if parse_strings_as_datetimes:
            # deprecated-v0.13.41
            warnings.warn(
                """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
                DeprecationWarning,
            )

        if value_set is None:
            # vacuously true
            return F.lit(True)

        return column.isin(value_set)
Example #11
    def _spark_condition(cls, column, _metrics, threshold, double_sided, **kwargs):
        z_score, _, _ = _metrics["column_values.z_score.map"]

        if double_sided:
            threshold = abs(threshold)
            z_score = F.abs(z_score)

        return z_score < threshold
Example #12
    def _spark(
        cls,
        execution_engine,
        metric_domain_kwargs,
        metric_value_kwargs,
        metrics,
        runtime_configuration,
    ):
        column_name = metric_domain_kwargs["column"]
        table_columns = metrics["table.column_types"]
        column_metadata = [
            col for col in table_columns if col["name"] == column_name
        ][0]

        if isinstance(column_metadata["type"], (sparktypes.StringType)):
            column = F.col(column_name).cast(sparktypes.IntegerType())
        else:
            raise TypeError(
                "Column must be a string-type capable of being cast to int.")

        compute_domain_kwargs = metric_domain_kwargs

        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN)

        if any(np.array(df.select(column.isNull()).collect())):
            raise TypeError(
                "Column must be a string-type capable of being cast to int.")

        diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
        diff = F.when(diff.isNull(), 1).otherwise(diff)

        if metric_value_kwargs["strictly"] is True:
            diff = F.when(diff <= 0, F.lit(False)).otherwise(F.lit(True))
        else:
            diff = F.when(diff < 0, F.lit(False)).otherwise(F.lit(True))

        return (
            np.array(df.select(diff).collect()).reshape(-1)[1:],
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
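A standalone sketch (toy column and local session are assumptions) of the lag windowing trick used above: ordering by a constant collapses everything into one partition so each row can be compared with a "previous" row (Spark warns about this, and the ordering is only as deterministic as the incoming data).

    from pyspark.sql import SparkSession, Window, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1,), (2,), (2,), (1,)], ["col"])

    # diff is NULL for the first row and value minus previous value afterwards.
    diff = F.col("col") - F.lag("col").over(Window.orderBy(F.lit("constant")))
    df.withColumn("diff", diff).show()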
Example #13
    def _spark(cls, column, **kwargs):
        def is_xml(val):
            try:
                etree.fromstring(val)
                return True
            except Exception:
                return False

        is_xml_udf = F.udf(is_xml, sparktypes.BooleanType())

        return is_xml_udf(column)
Example #14
    def _spark(cls, column, json_schema, **kwargs):
        def is_json(val):
            try:
                json.loads(val)
                return True
            except Exception:
                return False

        is_json_udf = F.udf(is_json, sparktypes.BooleanType())

        return is_json_udf(column)
Example #15
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs,
        metric_value_kwargs,
        metrics,
        runtime_configuration,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(metric_domain_kwargs,
                                                MetricDomainTypes.COLUMN)

        column_name = accessor_domain_kwargs["column"]
        column = F.col(column_name)

        query = F.when(column == 3, F.lit(False)).otherwise(F.lit(True))

        return (query, compute_domain_kwargs, accessor_domain_kwargs)
Example #16
    def _spark(cls, column, **kwargs):
        center_point = kwargs.get("center_point")
        unit = kwargs.get("unit")
        range = kwargs.get("range")
        projection = kwargs.get("projection")

        if projection == "fcc":
            if unit == "kilometers":
                distances = F.udf(
                    lambda x, y=center_point: fcc_projection(x, y),
                    sparktypes.FloatType(),
                )
            elif unit == "miles":
                distances = F.udf(
                    lambda x, y=center_point: fcc_projection(x, y) * 1.609344,
                    sparktypes.FloatType(),
                )
                range = range * 1.609344

            return F.when(distances(column) < range,
                          F.lit(True)).otherwise(F.lit(False))

        elif projection == "pythagorean":
            if unit == "kilometers":
                distances = F.udf(
                    lambda x, y=center_point: pythagorean_projection(x, y),
                    sparktypes.FloatType(),
                )
            elif unit == "miles":
                distances = F.udf(
                    lambda x, y=center_point: pythagorean_projection(x, y) *
                    1.609344,
                    sparktypes.FloatType(),
                )
                range = range * 1.609344

            return F.when(distances(column) < range,
                          F.lit(True)).otherwise(F.lit(False))
Example #17
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        sort = metric_value_kwargs.get("sort",
                                       cls.default_kwarg_values["sort"])
        collate = metric_value_kwargs.get("collate",
                                          cls.default_kwarg_values["collate"])

        if sort not in ["value", "count", "none"]:
            raise ValueError("sort must be either 'value', 'count', or 'none'")
        if collate is not None:
            raise ValueError(
                "collate parameter is not supported in SparkDFDataset")

        df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN)
        column = accessor_domain_kwargs["column"]

        value_counts = (df.select(column).where(
            F.col(column).isNotNull()).groupBy(column).count())
        if sort == "value":
            value_counts = value_counts.orderBy(column)
        elif sort == "count":
            value_counts = value_counts.orderBy(F.desc("count"))
        value_counts = value_counts.collect()
        series = pd.Series(
            [row["count"] for row in value_counts],
            index=pd.Index(data=[row[column] for row in value_counts],
                           name="value"),
            name="count",
        )
        return series
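A condensed sketch (toy data and local session assumed) of the collect-then-build-a-pandas-Series pattern above:

    import pandas as pd
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([("a",), ("a",), ("b",), (None,)], ["col"])

    # Count non-null occurrences per distinct value, then materialize the
    # result on the driver as a pandas Series indexed by value.
    rows = (
        df.where(F.col("col").isNotNull())
        .groupBy("col")
        .count()
        .orderBy("col")
        .collect()
    )
    series = pd.Series(
        [row["count"] for row in rows],
        index=pd.Index([row["col"] for row in rows], name="value"),
        name="count",
    )
    print(series)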
Example #18
    def _spark(cls, column, **kwargs):
        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False)
        if parse_strings_as_datetimes:
            warnings.warn(
                """The parameter "parse_strings_as_datetimes" is no longer supported and will be deprecated in a \
future release.  Please update code accordingly.
""",
                DeprecationWarning,
            )

            try:
                column = apply_dateutil_parse(column=column)
            except TypeError:
                pass

        return F.max(column)
Example #19
    def _spark(cls, column, **kwargs):
        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False)
        if parse_strings_as_datetimes:
            # deprecated-v0.13.41
            warnings.warn(
                """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
                DeprecationWarning,
            )

            try:
                column = apply_dateutil_parse(column=column)
            except TypeError:
                pass

        return F.min(column)
Example #20
    def _spark(cls, column, json_schema, **kwargs):
        def matches_json_schema(val):
            if val is None:
                return False
            try:
                val_json = json.loads(val)
                jsonschema.validate(val_json, json_schema)
                # jsonschema.validate raises an error if validation fails.
                # So if we make it this far, we know that the validation succeeded.
                return True
            except jsonschema.ValidationError:
                return False
            except jsonschema.SchemaError:
                raise
            except:
                raise

        matches_json_schema_udf = F.udf(matches_json_schema, sparktypes.BooleanType())

        return matches_json_schema_udf(column)
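The per-value logic the UDF above wraps can be exercised directly on the driver; a small sketch with an assumed toy schema:

    import json

    import jsonschema

    schema = {"type": "object", "properties": {"id": {"type": "number"}}, "required": ["id"]}

    def matches_json_schema(val):
        try:
            jsonschema.validate(json.loads(val), schema)
            return True
        except jsonschema.ValidationError:
            return False

    print(matches_json_schema('{"id": 1}'))    # True
    print(matches_json_schema('{"name": 2}'))  # False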
Example #21
    def _spark(cls, column, xml_schema, **kwargs):
        try:
            xmlschema_doc = etree.fromstring(xml_schema)
            xmlschema = etree.XMLSchema(xmlschema_doc)
        except etree.ParseError:
            raise
        except:
            raise

        def matches_xml_schema(val):
            if val is None:
                return False
            try:
                xml_doc = etree.fromstring(val)
                return xmlschema(xml_doc)
            except:
                raise

        matches_xml_schema_udf = F.udf(matches_xml_schema,
                                       sparktypes.BooleanType())

        return matches_xml_schema_udf(column)
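A self-contained sketch (toy schema, not from the source) of the lxml calls used above: etree.XMLSchema(...) compiles the schema, and calling the resulting object on a parsed document returns a boolean.

    from lxml import etree

    xmlschema_doc = etree.fromstring(
        b'<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">'
        b'<xs:element name="item" type="xs:string"/>'
        b'</xs:schema>'
    )
    xmlschema = etree.XMLSchema(xmlschema_doc)

    print(xmlschema(etree.fromstring(b"<item>ok</item>")))  # True
    print(xmlschema(etree.fromstring(b"<other/>")))         # False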
Example #22
    def _spark(
        cls,
        column,
        value_set,
        **kwargs,
    ):
        # no need to parse as datetime; just compare the strings as is
        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False
        )
        if parse_strings_as_datetimes:
            warnings.warn(
                f"""The parameter "parse_strings_as_datetimes" is no longer supported and will be deprecated in a \
future release.  Please update code accordingly.  Moreover, in "{cls.__name__}._spark()", it is not used.
""",
                DeprecationWarning,
            )

        if value_set is None:
            # vacuously true
            return F.lit(True)

        return column.isin(value_set)
Example #23
    def _spark(
        cls,
        execution_engine: "SparkDFExecutionEngine",
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        ignore_row_if = metric_value_kwargs["ignore_row_if"]
        compute_domain_kwargs = copy.deepcopy(metric_domain_kwargs)

        if ignore_row_if == "both_values_are_missing":
            # Drop a row only when both values are missing, i.e. keep rows
            # where at least one of the two columns is non-null.
            compute_domain_kwargs["row_condition"] = (
                F.col(metric_domain_kwargs["column_A"]).isNotNull()
                | F.col(metric_domain_kwargs["column_B"]).isNotNull())
            compute_domain_kwargs["condition_parser"] = "spark"
        elif ignore_row_if == "either_value_is_missing":
            # Drop a row when either value is missing, i.e. keep only rows
            # where both columns are non-null.
            compute_domain_kwargs["row_condition"] = (
                F.col(metric_domain_kwargs["column_A"]).isNotNull()
                & F.col(metric_domain_kwargs["column_B"]).isNotNull())
            compute_domain_kwargs["condition_parser"] = "spark"

        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN_PAIR)

        # Pair each row's (column_A, column_B) values so they can be joined
        # against the value_pairs_set below.
        df = df.withColumn(
            "combined",
            F.array(
                F.col(metric_domain_kwargs["column_A"]),
                F.col(metric_domain_kwargs["column_B"]),
            ),
        )

        value_set_df = (SQLContext(df._sc).createDataFrame(
            metric_value_kwargs["value_pairs_set"], ["col_A", "col_B"]).select(
                F.array("col_A", "col_B").alias("set_AB")))

        df = df.join(value_set_df, df["combined"] == value_set_df["set_AB"],
                     "left").withColumn(
                         "__success",
                         F.when(F.col("set_AB").isNull(),
                                F.lit(False)).otherwise(F.lit(True)),
                     )
        return df["__success"], compute_domain_kwargs, accessor_domain_kwargs
Example #24
    def _spark_function(cls, column, **kwargs):
        return F.length(column)
Example #25
    def _spark(cls, column, **kwargs: dict):  # type: ignore[no-untyped-def]
        return F.max(F.length(column))
Example #26
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        parse_strings_as_datetimes: bool = (
            metric_value_kwargs.get("parse_strings_as_datetimes") or False)
        if parse_strings_as_datetimes:
            warnings.warn(
                f"""The parameter "parse_strings_as_datetimes" is no longer supported and will be deprecated in a \
future release.  Please update code accordingly.  Moreover, in "{cls.__name__}._spark()", types are detected naturally.
""",
                DeprecationWarning,
            )

        # check if column is any type that could have na (numeric types)
        column_name = metric_domain_kwargs["column"]
        table_columns = metrics["table.column_types"]
        column_metadata = [
            col for col in table_columns if col["name"] == column_name
        ][0]
        if isinstance(
            column_metadata["type"],
            (
                sparktypes.LongType,
                sparktypes.DoubleType,
                sparktypes.IntegerType,
            ),
        ):
            # if column is any type that could have NA values, remove them (not filtered by .isNotNull())
            compute_domain_kwargs = execution_engine.add_column_row_condition(
                metric_domain_kwargs,
                filter_null=cls.filter_column_isnull,
                filter_nan=True,
            )
        else:
            compute_domain_kwargs = metric_domain_kwargs

        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(compute_domain_kwargs,
                                                MetricDomainTypes.COLUMN)

        # NOTE: 20201105 - parse_strings_as_datetimes is not supported here;
        # instead detect types naturally
        column = F.col(column_name)
        if isinstance(column_metadata["type"],
                      (sparktypes.TimestampType, sparktypes.DateType)):
            diff = F.datediff(
                column,
                F.lag(column).over(Window.orderBy(F.lit("constant"))))
        else:
            diff = column - F.lag(column).over(
                Window.orderBy(F.lit("constant")))
            diff = F.when(diff.isNull(), -1).otherwise(diff)

        # NOTE: because in spark we are implementing the window function directly,
        # we have to return the *unexpected* condition
        if metric_value_kwargs["strictly"]:
            return (
                F.when(diff >= 0, F.lit(True)).otherwise(F.lit(False)),
                compute_domain_kwargs,
                accessor_domain_kwargs,
            )
        # If we expect values to be flat or decreasing then unexpected values are those
        # that are increasing
        else:
            return (
                F.when(diff > 0, F.lit(True)).otherwise(F.lit(False)),
                compute_domain_kwargs,
                accessor_domain_kwargs,
            )
Example #27
    def _spark(cls, column, **kwargs):

        tz_udf = F.udf(is_valid_timezone, sparktypes.BooleanType())

        return tz_udf(column)
Example #28
    def _spark(cls, column, **kwargs):
        return F.count(F.lit(1)).over(Window.partitionBy(column)) <= 1
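A quick sketch (toy data and local session assumed) of how the window count above flags duplicates:

    from pyspark.sql import SparkSession, Window, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1,), (1,), (2,)], ["col"])

    # Each row sees the size of its value's partition; a value counts as
    # unique when that partition has exactly one row.
    cond = F.count(F.lit(1)).over(Window.partitionBy("col")) <= 1
    df.withColumn("__success", cond).show()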
Example #29
    def _spark(cls, column, **kwargs):
        """Spark Standard Deviation implementation"""
        return F.stddev_samp(column)
Example #30
    def _spark(cls, column, _table, _column_name, **kwargs):
        """Spark Mean Implementation"""
        types = dict(_table.dtypes)
        if types[_column_name] not in ("int", "float", "double", "bigint"):
            raise TypeError("Expected numeric column type for function mean()")
        return F.mean(column)