def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: dict,
    metric_value_kwargs: dict,
    metrics: Dict[str, Any],
    runtime_configuration: dict,
) -> List[pyspark_sql_Row]:
    query: Optional[str] = metric_value_kwargs.get(
        "query"
    ) or cls.default_kwarg_values.get("query")

    df: pyspark_sql_DataFrame
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    df.createOrReplaceTempView("tmp_view")

    column: str = metric_value_kwargs.get("column")
    query = query.format(col=column, active_batch="tmp_view")

    engine: pyspark_sql_SparkSession = execution_engine.spark
    result: List[pyspark_sql_Row] = engine.sql(query).collect()

    return result
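# --- Illustrative usage sketch (not from the source) ---
# A minimal example of the query-template mechanics used above, assuming only a local
# SparkSession: the "{col}" and "{active_batch}" placeholders are filled in via str.format,
# and the resolved SQL runs against the registered temp view, returning a list of Rows.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
example_df = spark.createDataFrame([(1,), (2,), (2,)], ["a"])
example_df.createOrReplaceTempView("tmp_view")

query_template = "SELECT {col}, COUNT(*) AS n FROM {active_batch} GROUP BY {col}"
resolved_query = query_template.format(col="a", active_batch="tmp_view")
rows = spark.sql(resolved_query).collect()  # e.g. [Row(a=1, n=1), Row(a=2, n=2)]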
def test_get_compute_domain_with_unmeetable_row_condition(spark_session):
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in pd_df.to_records(index=False)
        ],
        pd_df.columns.tolist(),
    )
    expected_df = df.filter(F.col("b") > 24)

    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"row_condition": "b > 24", "condition_parser": "spark"},
        domain_type=MetricDomainTypes.TABLE,
    )

    # Ensuring data has been properly queried
    assert data.schema == expected_df.schema
    assert data.collect() == expected_df.collect()

    # Ensuring compute kwargs have not been modified
    assert "row_condition" in compute_kwargs.keys()
    assert accessor_kwargs == {}
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    min_value = metric_value_kwargs.get("min_value")
    max_value = metric_value_kwargs.get("max_value")
    strict_min = metric_value_kwargs.get("strict_min")
    strict_max = metric_value_kwargs.get("strict_max")

    # Validate the bounds once, before building the filter condition
    if min_value is not None and max_value is not None and min_value > max_value:
        raise ValueError("min_value cannot be greater than max_value")
    if min_value is None and max_value is None:
        raise ValueError("min_value and max_value cannot both be None")

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    column = df[accessor_domain_kwargs["column"]]

    if min_value is None:
        if strict_max:
            condition = column < max_value
        else:
            condition = column <= max_value
    elif max_value is None:
        if strict_min:
            condition = column > min_value
        else:
            condition = column >= min_value
    else:
        if strict_min and strict_max:
            condition = (column > min_value) & (column < max_value)
        elif strict_min:
            condition = (column > min_value) & (column <= max_value)
        elif strict_max:
            condition = (column >= min_value) & (column < max_value)
        else:
            condition = (column >= min_value) & (column <= max_value)

    return df.filter(condition).count()
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    return _get_spark_column_metadata(
        df.schema, include_nested=metric_value_kwargs["include_nested"]
    )
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    if metric_value_kwargs["fetch_all"]:
        return df.collect()
    return df.head(metric_value_kwargs["n_rows"])
def test_get_compute_domain_with_column_domain(spark_session):
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in pd_df.to_records(index=False)
        ],
        pd_df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN
    )
    assert compute_kwargs is not None, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {"column": "a"}
    assert data.schema == df.schema
    assert data.collect() == df.collect()
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column_name = accessor_domain_kwargs["column"]
    column = F.col(column_name)
    query = F.when(column == 3, F.lit(False)).otherwise(F.lit(True))
    return (query, compute_domain_kwargs, accessor_domain_kwargs)
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    sort = metric_value_kwargs.get("sort", cls.default_kwarg_values["sort"])
    collate = metric_value_kwargs.get("collate", cls.default_kwarg_values["collate"])

    if sort not in ["value", "count", "none"]:
        raise ValueError("sort must be either 'value', 'count', or 'none'")
    if collate is not None:
        raise ValueError("collate parameter is not supported in SparkDFDataset")

    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column = accessor_domain_kwargs["column"]

    value_counts = (
        df.select(column).where(F.col(column).isNotNull()).groupBy(column).count()
    )
    if sort == "value":
        value_counts = value_counts.orderBy(column)
    elif sort == "count":
        value_counts = value_counts.orderBy(F.desc("count"))
    value_counts = value_counts.collect()

    series = pd.Series(
        [row["count"] for row in value_counts],
        index=pd.Index(data=[row[column] for row in value_counts], name="value"),
        name="count",
    )
    return series
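# --- Illustrative usage sketch (not from the source) ---
# A minimal sketch of the collect-to-Series pattern above, assuming only pandas and a local
# SparkSession: Spark computes the per-value counts, and the collected Rows are rewrapped
# as a pandas Series whose index holds the distinct (non-null) values.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
example_df = spark.createDataFrame([("a",), ("a",), ("b",), (None,)], ["col"])

rows = (
    example_df.select("col")
    .where(F.col("col").isNotNull())
    .groupBy("col")
    .count()
    .orderBy("col")
    .collect()
)
series = pd.Series(
    [row["count"] for row in rows],
    index=pd.Index(data=[row["col"] for row in rows], name="value"),
    name="count",
)
# series: value "a" -> 2, "b" -> 1; the null row is excluded.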
def test_add_column_row_condition(spark_session):
    df = pd.DataFrame({"foo": [1, 2, 3, 3, None, 2, 3, 4, 5, 6]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine(batch_data_dict={tuple(): df})
    domain_kwargs = {"column": "foo"}

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=True, filter_nan=False
    )
    assert new_domain_kwargs["row_condition"] == 'col("foo").notnull()'
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (2,), (3,), (4,), (5,), (6,)]

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=True, filter_nan=True
    )
    assert new_domain_kwargs["row_condition"] == "NOT isnan(foo) AND foo IS NOT NULL"
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (2,), (3,), (4,), (5,), (6,)]

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=False, filter_nan=True
    )
    assert new_domain_kwargs["row_condition"] == "NOT isnan(foo)"
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (None,), (2,), (3,), (4,), (5,), (6,)]

    # This time, our skip value *will* be nan
    df = pd.DataFrame({"foo": [1, 2, 3, 3, None, 2, 3, 4, 5, 6]})
    df = spark_session.createDataFrame(df)
    engine = SparkDFExecutionEngine(batch_data_dict={tuple(): df})

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=False, filter_nan=True
    )
    assert new_domain_kwargs["row_condition"] == "NOT isnan(foo)"
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (2,), (3,), (4,), (5,), (6,)]

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=True, filter_nan=False
    )
    assert new_domain_kwargs["row_condition"] == 'col("foo").notnull()'
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    expected = [(1,), (2,), (3,), (3,), (np.nan,), (2,), (3,), (4,), (5,), (6,)]
    # since nan != nan by default
    assert np.allclose(res, expected, rtol=0, atol=0, equal_nan=True)
def _spark( cls, execution_engine: SparkDFExecutionEngine, metric_domain_kwargs: Dict, metric_value_kwargs: Dict, metrics: Dict[str, Any], runtime_configuration: Dict, ): parse_strings_as_datetimes: bool = ( metric_value_kwargs.get("parse_strings_as_datetimes") or False) if parse_strings_as_datetimes: warnings.warn( f"""The parameter "parse_strings_as_datetimes" is no longer supported and will be deprecated in a \ future release. Please update code accordingly. Moreover, in "{cls.__name__}._spark()", types are detected naturally. """, DeprecationWarning, ) # check if column is any type that could have na (numeric types) column_name = metric_domain_kwargs["column"] table_columns = metrics["table.column_types"] column_metadata = [ col for col in table_columns if col["name"] == column_name ][0] if isinstance( column_metadata["type"], ( sparktypes.LongType, sparktypes.DoubleType, sparktypes.IntegerType, ), ): # if column is any type that could have NA values, remove them (not filtered by .isNotNull()) compute_domain_kwargs = execution_engine.add_column_row_condition( metric_domain_kwargs, filter_null=cls.filter_column_isnull, filter_nan=True, ) else: compute_domain_kwargs = metric_domain_kwargs ( df, compute_domain_kwargs, accessor_domain_kwargs, ) = execution_engine.get_compute_domain(compute_domain_kwargs, MetricDomainTypes.COLUMN) # NOTE: 20201105 - parse_strings_as_datetimes is not supported here; # instead detect types naturally column = F.col(column_name) if isinstance(column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)): diff = F.datediff( column, F.lag(column).over(Window.orderBy(F.lit("constant")))) else: diff = column - F.lag(column).over( Window.orderBy(F.lit("constant"))) diff = F.when(diff.isNull(), -1).otherwise(diff) # NOTE: because in spark we are implementing the window function directly, # we have to return the *unexpected* condition if metric_value_kwargs["strictly"]: return ( F.when(diff >= 0, F.lit(True)).otherwise(F.lit(False)), compute_domain_kwargs, accessor_domain_kwargs, ) # If we expect values to be flat or decreasing then unexpected values are those # that are decreasing else: return ( F.when(diff > 0, F.lit(True)).otherwise(F.lit(False)), compute_domain_kwargs, accessor_domain_kwargs, )
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    bins = metric_value_kwargs["bins"]
    column = metric_domain_kwargs["column"]

    """return a list of counts corresponding to bins"""
    bins = list(copy.deepcopy(bins))  # take a copy since we are inserting and popping
    if bins[0] == -np.inf or bins[0] == -float("inf"):
        added_min = False
        bins[0] = -float("inf")
    else:
        added_min = True
        bins.insert(0, -float("inf"))

    if bins[-1] == np.inf or bins[-1] == float("inf"):
        added_max = False
        bins[-1] = float("inf")
    else:
        added_max = True
        bins.append(float("inf"))

    temp_column = df.select(column).where(F.col(column).isNotNull())
    bucketizer = Bucketizer(splits=bins, inputCol=column, outputCol="buckets")
    bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column)

    # This is painful to do, but: bucketizer cannot handle values outside of a range
    # (hence adding -/+ infinity above)
    # Further, it *always* follows the numpy convention of lower_bound <= bin < upper_bound
    # for all but the last bin
    # But, since the last bin in our case will often be +infinity, we need to
    # find the number of values exactly equal to the upper bound to add those
    # We'll try for an optimization by asking for it at the same time
    if added_max:
        upper_bound_count = (
            temp_column.select(column).filter(F.col(column) == bins[-2]).count()
        )
    else:
        upper_bound_count = 0

    hist_rows = bucketed.groupBy("buckets").count().collect()
    # Spark only returns buckets that have nonzero counts.
    hist = [0] * (len(bins) - 1)
    for row in hist_rows:
        hist[int(row["buckets"])] = row["count"]

    hist[-2] += upper_bound_count

    if added_min:
        below_bins = hist.pop(0)
        bins.pop(0)
        if below_bins > 0:
            logger.warning("Discarding histogram values below lowest bin.")

    if added_max:
        above_bins = hist.pop(-1)
        bins.pop(-1)
        if above_bins > 0:
            logger.warning("Discarding histogram values above highest bin.")

    return hist
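# --- Illustrative usage sketch (not from the source) ---
# A minimal sketch of the Bucketizer behavior that motivates the padding above, assuming only
# a local SparkSession: splits define half-open bins [lo, hi) (the last bin also includes its
# upper split), and rows outside the splits are dropped when handleInvalid is set to "skip".
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.master("local[1]").getOrCreate()
example_df = spark.createDataFrame([(0.5,), (1.0,), (2.0,), (10.0,)], ["x"])

bucketizer = Bucketizer(
    splits=[-float("inf"), 1.0, 2.0, float("inf")],
    inputCol="x",
    outputCol="buckets",
)
bucketed = bucketizer.setHandleInvalid("skip").transform(example_df)
bucketed.groupBy("buckets").count().show()
# 0.5 falls in bucket 0, 1.0 in bucket 1, and 2.0 / 10.0 in bucket 2;
# only buckets with nonzero counts are returned, which is why the metric backfills zeros.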
def _spark( cls, execution_engine: SparkDFExecutionEngine, metric_domain_kwargs: Dict, metric_value_kwargs: Dict, metrics: Dict[str, Any], runtime_configuration: Dict, ): parse_strings_as_datetimes: bool = ( metric_value_kwargs.get("parse_strings_as_datetimes") or False) if parse_strings_as_datetimes: # deprecated-v0.13.41 warnings.warn( """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 in \ v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \ please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/ """, DeprecationWarning, ) # check if column is any type that could have na (numeric types) column_name = metric_domain_kwargs["column"] table_columns = metrics["table.column_types"] column_metadata = [ col for col in table_columns if col["name"] == column_name ][0] if isinstance( column_metadata["type"], ( sparktypes.LongType, sparktypes.DoubleType, sparktypes.IntegerType, ), ): # if column is any type that could have NA values, remove them (not filtered by .isNotNull()) compute_domain_kwargs = execution_engine.add_column_row_condition( metric_domain_kwargs, filter_null=cls.filter_column_isnull, filter_nan=True, ) else: compute_domain_kwargs = metric_domain_kwargs ( df, compute_domain_kwargs, accessor_domain_kwargs, ) = execution_engine.get_compute_domain( compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN) # NOTE: 20201105 - parse_strings_as_datetimes is not supported here; # instead detect types naturally column = F.col(column_name) if isinstance(column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)): diff = F.datediff( column, F.lag(column).over(Window.orderBy(F.lit("constant")))) else: diff = column - F.lag(column).over( Window.orderBy(F.lit("constant"))) diff = F.when(diff.isNull(), 1).otherwise(diff) # NOTE: because in spark we are implementing the window function directly, # we have to return the *unexpected* condition. # If we expect values to be *strictly* increasing then unexpected values are those # that are flat or decreasing if metric_value_kwargs["strictly"] is True: return ( F.when(diff <= 0, F.lit(True)).otherwise(F.lit(False)), compute_domain_kwargs, accessor_domain_kwargs, ) # If we expect values to be flat or increasing then unexpected values are those # that are decreasing else: return ( F.when(diff < 0, F.lit(True)).otherwise(F.lit(False)), compute_domain_kwargs, accessor_domain_kwargs, )
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    # check if column is any type that could have na (numeric types)
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]
    if isinstance(
        column_metadata["type"],
        (
            sparktypes.LongType,
            sparktypes.DoubleType,
            sparktypes.IntegerType,
        ),
    ):
        # if column is any type that could have NA values, remove them (not filtered by .isNotNull())
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs,
            filter_null=cls.filter_column_isnull,
            filter_nan=True,
        )
    else:
        compute_domain_kwargs = metric_domain_kwargs

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )

    # NOTE: 20201105 - parse_strings_as_datetimes is not supported here;
    # instead detect types naturally
    column = F.col(column_name)
    if isinstance(
        column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)
    ):
        diff = F.datediff(
            column, F.lag(column).over(Window.orderBy(F.lit("constant")))
        )
    else:
        diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    diff = F.when(diff.isNull(), 1).otherwise(diff)

    # NOTE: because in spark we are implementing the window function directly,
    # we have to return the *unexpected* condition.
    # If we expect values to be *strictly* increasing then unexpected values are those
    # that are flat or decreasing
    if metric_value_kwargs["strictly"] is True:
        return (
            F.when(diff <= 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
    # If we expect values to be flat or increasing then unexpected values are those
    # that are decreasing
    else:
        return (
            F.when(diff < 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
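# --- Illustrative usage sketch (not from the source) ---
# A minimal sketch of the window/lag pattern shared by the increasing and decreasing metrics
# above, assuming only a local SparkSession: ordering the window by a constant literal gives
# every row the same sort key (Spark warns that no partition is defined), and F.lag exposes
# the previous row's value so a row-to-row difference can be computed.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
example_df = spark.createDataFrame([(1,), (2,), (2,), (1,)], ["a"])

window = Window.orderBy(F.lit("constant"))
diff = F.col("a") - F.lag(F.col("a")).over(window)
diff = F.when(diff.isNull(), 1).otherwise(diff)  # treat the first row as non-violating

# For "values should be increasing (not strictly)", a row is unexpected when diff < 0.
example_df.withColumn(
    "unexpected", F.when(diff < 0, F.lit(True)).otherwise(F.lit(False))
).show()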