def _spark(
    cls,
    execution_engine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]

    # Only string columns are accepted; their values must be castable to int.
    if isinstance(column_metadata["type"], sparktypes.StringType):
        column = F.col(column_name).cast(sparktypes.IntegerType())
    else:
        raise TypeError("Column must be a string-type capable of being cast to int.")

    compute_domain_kwargs = metric_domain_kwargs
    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )

    # A failed cast yields null, so any null after the cast means a value was not an integer string.
    if any(np.array(df.select(column.isNull()).collect())):
        raise TypeError("Column must be a string-type capable of being cast to int.")

    # Difference between each row and its predecessor; ordering the window by a constant
    # makes every row tie, so the comparison effectively follows the data's existing order.
    # The first row has no predecessor, so its null diff is treated as 1.
    diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    diff = F.when(diff.isNull(), 1).otherwise(diff)

    if metric_value_kwargs["strictly"] is True:
        diff = F.when(diff <= 0, F.lit(False)).otherwise(F.lit(True))
    else:
        diff = F.when(diff < 0, F.lit(False)).otherwise(F.lit(True))

    # Drop the first (synthetic) element so only real row-to-row comparisons are returned.
    return (
        np.array(df.select(diff).collect()).reshape(-1)[1:],
        compute_domain_kwargs,
        accessor_domain_kwargs,
    )

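
# A minimal standalone sketch (not part of the metric above) of the lag-over-window
# technique it relies on: cast a string column to int, diff each row against the
# previous one, and drop the first (null-lag) row. The SparkSession, DataFrame, and
# column name below are illustrative assumptions, not taken from the source.
import numpy as np
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql import types as sparktypes

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("1",), ("2",), ("2",), ("5",)], ["user_id"])

column = F.col("user_id").cast(sparktypes.IntegerType())
# Ordering by a constant literal mirrors Window.orderBy(F.lit("constant")) above:
# all rows tie, so each row is compared against the previous row in the incoming order.
diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
diff = F.when(diff.isNull(), 1).otherwise(diff)  # first row has no predecessor

strictly = True
if strictly:
    flags = F.when(diff <= 0, F.lit(False)).otherwise(F.lit(True))
else:
    flags = F.when(diff < 0, F.lit(False)).otherwise(F.lit(True))

# Expected result for strictly=True: [ True False  True] (the flat 2 -> 2 step fails).
print(np.array(df.select(flags).collect()).reshape(-1)[1:])
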
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    parse_strings_as_datetimes: bool = (
        metric_value_kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        # deprecated-v0.13.41
        warnings.warn(
            """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
            DeprecationWarning,
        )

    # check if column is any type that could have na (numeric types)
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]
    if isinstance(
        column_metadata["type"],
        (
            sparktypes.LongType,
            sparktypes.DoubleType,
            sparktypes.IntegerType,
        ),
    ):
        # if column is any type that could have NA values, remove them (not filtered by .isNotNull())
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs,
            filter_null=cls.filter_column_isnull,
            filter_nan=True,
        )
    else:
        compute_domain_kwargs = metric_domain_kwargs

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )

    # NOTE: 20201105 - parse_strings_as_datetimes is not supported here;
    # instead detect types naturally
    column = F.col(column_name)
    if isinstance(
        column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)
    ):
        diff = F.datediff(
            column, F.lag(column).over(Window.orderBy(F.lit("constant")))
        )
    else:
        diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    diff = F.when(diff.isNull(), 1).otherwise(diff)

    # NOTE: because in spark we are implementing the window function directly,
    # we have to return the *unexpected* condition.
    # If we expect values to be *strictly* increasing then unexpected values are those
    # that are flat or decreasing
    if metric_value_kwargs["strictly"] is True:
        return (
            F.when(diff <= 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
    # If we expect values to be flat or increasing then unexpected values are those
    # that are decreasing
    else:
        return (
            F.when(diff < 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )

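
# A small illustrative sketch of the DateType/TimestampType branch above, outside the
# execution-engine plumbing. Assumptions: a local SparkSession and a toy date column;
# none of these names come from the source. For date-like columns the row-to-row change
# is computed with F.datediff (whole days) instead of arithmetic subtraction.
import datetime
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [
        (datetime.date(2021, 1, 1),),
        (datetime.date(2021, 1, 3),),
        (datetime.date(2021, 1, 3),),
    ],
    ["event_date"],
)

column = F.col("event_date")
diff = F.datediff(column, F.lag(column).over(Window.orderBy(F.lit("constant"))))
diff = F.when(diff.isNull(), 1).otherwise(diff)

# Unexpected condition for *strictly* increasing dates: flat or earlier rows flag as True.
unexpected = F.when(diff <= 0, F.lit(True)).otherwise(F.lit(False))
df.select(column, unexpected.alias("unexpected")).show()
# Expected rows: (2021-01-01, false), (2021-01-03, false), (2021-01-03, true)
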
def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    parse_strings_as_datetimes: bool = (
        metric_value_kwargs.get("parse_strings_as_datetimes") or False
    )
    if parse_strings_as_datetimes:
        warnings.warn(
            f"""The parameter "parse_strings_as_datetimes" is no longer supported and will be deprecated in a \
future release. Please update code accordingly. Moreover, in "{cls.__name__}._spark()", types are detected \
naturally.
""",
            DeprecationWarning,
        )

    # check if column is any type that could have na (numeric types)
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]
    if isinstance(
        column_metadata["type"],
        (
            sparktypes.LongType,
            sparktypes.DoubleType,
            sparktypes.IntegerType,
        ),
    ):
        # if column is any type that could have NA values, remove them (not filtered by .isNotNull())
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs,
            filter_null=cls.filter_column_isnull,
            filter_nan=True,
        )
    else:
        compute_domain_kwargs = metric_domain_kwargs

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, MetricDomainTypes.COLUMN
    )

    # NOTE: 20201105 - parse_strings_as_datetimes is not supported here;
    # instead detect types naturally
    column = F.col(column_name)
    if isinstance(
        column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)
    ):
        diff = F.datediff(
            column, F.lag(column).over(Window.orderBy(F.lit("constant")))
        )
    else:
        diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    diff = F.when(diff.isNull(), -1).otherwise(diff)

    # NOTE: because in spark we are implementing the window function directly,
    # we have to return the *unexpected* condition.
    # If we expect values to be *strictly* decreasing then unexpected values are those
    # that are flat or increasing
    if metric_value_kwargs["strictly"]:
        return (
            F.when(diff >= 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
    # If we expect values to be flat or decreasing then unexpected values are those
    # that are increasing
    else:
        return (
            F.when(diff > 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )

def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    # check if column is any type that could have na (numeric types)
    column_name = metric_domain_kwargs["column"]
    table_columns = metrics["table.column_types"]
    column_metadata = [col for col in table_columns if col["name"] == column_name][0]
    if isinstance(
        column_metadata["type"],
        (
            sparktypes.LongType,
            sparktypes.DoubleType,
            sparktypes.IntegerType,
        ),
    ):
        # if column is any type that could have NA values, remove them (not filtered by .isNotNull())
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs,
            filter_null=cls.filter_column_isnull,
            filter_nan=True,
        )
    else:
        compute_domain_kwargs = metric_domain_kwargs

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )

    # NOTE: 20201105 - parse_strings_as_datetimes is not supported here;
    # instead detect types naturally
    column = F.col(column_name)
    if isinstance(
        column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType)
    ):
        diff = F.datediff(
            column, F.lag(column).over(Window.orderBy(F.lit("constant")))
        )
    else:
        diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
    diff = F.when(diff.isNull(), 1).otherwise(diff)

    # NOTE: because in spark we are implementing the window function directly,
    # we have to return the *unexpected* condition.
    # If we expect values to be *strictly* increasing then unexpected values are those
    # that are flat or decreasing
    if metric_value_kwargs["strictly"] is True:
        return (
            F.when(diff <= 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
    # If we expect values to be flat or increasing then unexpected values are those
    # that are decreasing
    else:
        return (
            F.when(diff < 0, F.lit(True)).otherwise(F.lit(False)),
            compute_domain_kwargs,
            accessor_domain_kwargs,
        )
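
# A hedged usage sketch of how the Column returned above behaves as an *unexpected*
# condition: applied to a toy numeric column it marks rows that break the monotonicity
# requirement. The session, data, and column name are illustrative assumptions, and the
# null/NaN filter below only mimics (in plain Spark) what add_column_row_condition
# arranges through the execution engine.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(1.0,), (3.0,), (3.0,), (2.0,), (None,)], "reading double"
)

# Numeric columns are filtered for null/NaN before the diff is computed.
df = df.filter(F.col("reading").isNotNull() & ~F.isnan("reading"))

column = F.col("reading")
diff = column - F.lag(column).over(Window.orderBy(F.lit("constant")))
diff = F.when(diff.isNull(), 1).otherwise(diff)

strictly_unexpected = F.when(diff <= 0, F.lit(True)).otherwise(F.lit(False))
loose_unexpected = F.when(diff < 0, F.lit(True)).otherwise(F.lit(False))

df.select(
    column,
    strictly_unexpected.alias("strict"),
    loose_unexpected.alias("loose"),
).show()
# Expected: 1.0 -> (false, false), 3.0 -> (false, false),
#           3.0 -> (true, false),  2.0 -> (true, true)
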