def test_get_domain_records_with_column_domain_and_filter_conditions_raises_error_on_multiple_conditions(
    sa,
):
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2, 3, 4, 5, None],
            "c": [1, 2, 3, 4, None],
        }
    )
    engine = build_sa_engine(df, sa)
    with pytest.raises(ge_exceptions.GreatExpectationsError) as e:
        data = engine.get_domain_records(
            domain_kwargs={
                "column": "a",
                "row_condition": 'col("a")<2',
                "condition_parser": "great_expectations__experimental__",
                "filter_conditions": [
                    RowCondition(
                        condition='col("b").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                    RowCondition(
                        condition='col("c").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                ],
            }
        )

def test_add_column_row_condition_filter_null_row_condition_not_present(
    test_execution_engine,
):
    e = test_execution_engine

    # Checking that adding a simple column row condition is functional.
    # The default of add_column_row_condition is to apply filter_null=True.
    domain_kwargs: dict = {}
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs, "a")
    assert new_domain_kwargs == {
        "filter_conditions": [
            RowCondition(
                condition='col("a").notnull()',
                condition_type=RowConditionParserType.GE,
            )
        ]
    }

    # Ensuring that this also works when the column is provided via domain_kwargs
    # instead of the column_name parameter.
    # The default of add_column_row_condition is to apply filter_null=True.
    new_domain_kwargs = e.add_column_row_condition({"column": "a"})
    assert new_domain_kwargs == {
        "column": "a",
        "filter_conditions": [
            RowCondition(
                condition='col("a").notnull()',
                condition_type=RowConditionParserType.GE,
            )
        ],
    }

def test_get_domain_records_with_different_column_domain_and_filter_conditions(
    spark_session, basic_spark_df_execution_engine, spark_df_from_pandas_df
):
    pd_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2, 3, 4, 5, None],
            "c": [1, 2, 3, 4, None],
        }
    )
    df = spark_df_from_pandas_df(spark_session, pd_df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="1234", batch_data=df)
    data = engine.get_domain_records(
        domain_kwargs={
            "column": "a",
            "row_condition": 'col("a")<2',
            "condition_parser": "great_expectations__experimental__",
            "filter_conditions": [
                RowCondition(
                    condition="b IS NOT NULL",
                    condition_type=RowConditionParserType.SPARK_SQL,
                )
            ],
        }
    )

    expected_column_pd_df = pd_df.iloc[:1]
    expected_column_df = spark_df_from_pandas_df(spark_session, expected_column_pd_df)

    assert dataframes_equal(
        data, expected_column_df
    ), "Data does not match after getting full access compute domain"

def _combine_row_conditions(
    self, row_conditions: List[RowCondition]
) -> RowCondition:
    """Combine row conditions using AND if condition_type is SPARK_SQL.

    Note: although this method does not currently use `self` internally, we are
    not marking it as @staticmethod since it is meant to only be called
    internally in this class.

    Args:
        row_conditions: Row conditions of type SPARK_SQL

    Returns:
        The single combined row condition
    """
    assert all(
        condition.condition_type == RowConditionParserType.SPARK_SQL
        for condition in row_conditions
    ), "All row conditions must have type SPARK_SQL"
    conditions: List[str] = [
        row_condition.condition for row_condition in row_conditions
    ]
    joined_condition: str = " AND ".join(conditions)
    return RowCondition(
        condition=joined_condition, condition_type=RowConditionParserType.SPARK_SQL
    )

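# A minimal sketch (not part of the library source) of what _combine_row_conditions
# is expected to return. The test name is hypothetical; it reuses the
# basic_spark_df_execution_engine fixture from the tests below.
def test_combine_row_conditions_sketch(basic_spark_df_execution_engine):
    engine = basic_spark_df_execution_engine
    combined = engine._combine_row_conditions(
        [
            RowCondition(
                condition="b IS NOT NULL",
                condition_type=RowConditionParserType.SPARK_SQL,
            ),
            RowCondition(
                condition="NOT isnan(b)",
                condition_type=RowConditionParserType.SPARK_SQL,
            ),
        ]
    )
    # Conditions of type SPARK_SQL are joined with " AND " into a single condition.
    assert combined == RowCondition(
        condition="b IS NOT NULL AND NOT isnan(b)",
        condition_type=RowConditionParserType.SPARK_SQL,
    )
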
def test_get_domain_records_with_different_column_domain_and_filter_conditions(sa):
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2, 3, 4, 5, None],
            "c": [1, 2, 3, 4, None],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column": "a",
            "row_condition": 'col("a")<2',
            "condition_parser": "great_expectations__experimental__",
            "filter_conditions": [
                RowCondition(
                    condition='col("b").notnull()',
                    condition_type=RowConditionParserType.GE,
                )
            ],
        }
    )
    domain_data = engine.engine.execute(get_sqlalchemy_domain_data(data)).fetchall()

    expected_column_df = df.iloc[:1]
    engine = build_sa_engine(expected_column_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

def test_add_column_row_condition_filter_null_row_condition_present(
    test_execution_engine,
):
    e = test_execution_engine

    # Ensuring that we don't override an existing row condition
    # when filter_null=True is passed explicitly.
    domain_kwargs: dict = {"column": "a", "row_condition": "some_row_condition"}
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs, filter_null=True)
    assert new_domain_kwargs == {
        "column": "a",
        "row_condition": "some_row_condition",
        "filter_conditions": [
            RowCondition(
                condition='col("a").notnull()',
                condition_type=RowConditionParserType.GE,
            )
        ],
    }

    # Ensuring the same behavior when relying on the default filter_null=True.
    domain_kwargs: dict = {"column": "a", "row_condition": "some_row_condition"}
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs)
    assert new_domain_kwargs == {
        "column": "a",
        "row_condition": "some_row_condition",
        "filter_conditions": [
            RowCondition(
                condition='col("a").notnull()',
                condition_type=RowConditionParserType.GE,
            )
        ],
    }

def add_column_row_condition(
    self, domain_kwargs, column_name=None, filter_null=True, filter_nan=False
):
    # We explicitly handle filter_nan & filter_null for Spark using Spark-native conditions.
    new_domain_kwargs = copy.deepcopy(domain_kwargs)
    assert "column" in domain_kwargs or column_name is not None
    if column_name is not None:
        column = column_name
    else:
        column = domain_kwargs["column"]

    filter_conditions: List[RowCondition] = []
    if filter_null:
        filter_conditions.append(
            RowCondition(
                condition=f"{column} IS NOT NULL",
                condition_type=RowConditionParserType.SPARK_SQL,
            )
        )
    if filter_nan:
        filter_conditions.append(
            RowCondition(
                condition=f"NOT isnan({column})",
                condition_type=RowConditionParserType.SPARK_SQL,
            )
        )

    if not (filter_null or filter_nan):
        logger.warning(
            "add_column_row_condition called without specifying a desired row condition"
        )

    new_domain_kwargs.setdefault("filter_conditions", []).extend(filter_conditions)
    return new_domain_kwargs

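# A minimal sketch (not part of the library source): because this override uses
# setdefault(...).extend(...), any filter_conditions already present in
# domain_kwargs are kept and the new Spark SQL conditions are appended after them.
# The test name is hypothetical; it reuses the basic_spark_df_execution_engine fixture.
def test_add_column_row_condition_preserves_existing_filters_sketch(
    basic_spark_df_execution_engine,
):
    engine = basic_spark_df_execution_engine
    existing = RowCondition(
        condition="b IS NOT NULL", condition_type=RowConditionParserType.SPARK_SQL
    )
    new_domain_kwargs = engine.add_column_row_condition(
        {"column": "foo", "filter_conditions": [existing]}, filter_null=True
    )
    assert new_domain_kwargs["filter_conditions"] == [
        existing,
        RowCondition(
            condition="foo IS NOT NULL",
            condition_type=RowConditionParserType.SPARK_SQL,
        ),
    ]
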
def test_add_column_row_condition_filter_null_row_condition_none(
    test_execution_engine,
):
    e = test_execution_engine

    # Ensuring that everything still works if a row condition of None is given.
    # The default of add_column_row_condition is to apply filter_null=True.
    domain_kwargs: dict = {"column": "a", "row_condition": None}
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs)
    assert new_domain_kwargs == {
        "column": "a",
        "row_condition": None,
        "filter_conditions": [
            RowCondition(
                condition='col("a").notnull()',
                condition_type=RowConditionParserType.GE,
            )
        ],
    }

def add_column_row_condition(
    self, domain_kwargs, column_name=None, filter_null=True, filter_nan=False
):
    """EXPERIMENTAL

    Add a row condition for handling null filter.

    Args:
        domain_kwargs: the domain kwargs to use as the base and to which to add the condition
        column_name: if provided, use this name to add the condition; otherwise, will use "column"
            key from table_domain_kwargs
        filter_null: if true, add a filter for null values
        filter_nan: if true, add a filter for nan values
    """
    if filter_null is False and filter_nan is False:
        logger.warning(
            "add_column_row_condition called with no filter condition requested"
        )
        return domain_kwargs

    if filter_nan:
        raise ge_exceptions.GreatExpectationsError(
            "Base ExecutionEngine does not support adding nan condition filters"
        )

    new_domain_kwargs = copy.deepcopy(domain_kwargs)
    assert (
        "column" in domain_kwargs or column_name is not None
    ), "No column provided: A column must be provided in domain_kwargs or in the column_name parameter"
    if column_name is not None:
        column = column_name
    else:
        column = domain_kwargs["column"]

    row_condition: RowCondition = RowCondition(
        condition=f'col("{column}").notnull()',
        condition_type=RowConditionParserType.GE,
    )
    new_domain_kwargs.setdefault("filter_conditions", []).append(row_condition)
    return new_domain_kwargs

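# A minimal sketch (not part of the library source): on the base ExecutionEngine,
# requesting a NaN filter raises, since only engine-specific overrides (such as the
# Spark implementation above) translate it into a native condition. The test name is
# hypothetical; it reuses the test_execution_engine fixture from the tests above.
def test_add_column_row_condition_filter_nan_raises_sketch(test_execution_engine):
    with pytest.raises(ge_exceptions.GreatExpectationsError):
        test_execution_engine.add_column_row_condition(
            {"column": "a"}, filter_nan=True
        )
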
def test_add_column_row_condition(spark_session, basic_spark_df_execution_engine):
    df = pd.DataFrame({"foo": [1, 2, 3, 3, None, 2, 3, 4, 5, 6]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="1234", batch_data=df)
    domain_kwargs = {"column": "foo"}

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=True, filter_nan=False
    )
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(
            condition="foo IS NOT NULL",
            condition_type=RowConditionParserType.SPARK_SQL,
        )
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (2,), (3,), (4,), (5,), (6,)]

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=True, filter_nan=True
    )
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(
            condition="foo IS NOT NULL",
            condition_type=RowConditionParserType.SPARK_SQL,
        ),
        RowCondition(
            condition="NOT isnan(foo)",
            condition_type=RowConditionParserType.SPARK_SQL,
        ),
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (2,), (3,), (4,), (5,), (6,)]

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=False, filter_nan=True
    )
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(
            condition="NOT isnan(foo)",
            condition_type=RowConditionParserType.SPARK_SQL,
        )
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (None,), (2,), (3,), (4,), (5,), (6,)]

    # This time, our skip value *will* be nan
    df = pd.DataFrame({"foo": [1, 2, 3, 3, None, 2, 3, 4, 5, 6]})
    df = spark_session.createDataFrame(df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="1234", batch_data=df)

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=False, filter_nan=True
    )
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(
            condition="NOT isnan(foo)",
            condition_type=RowConditionParserType.SPARK_SQL,
        )
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    assert res == [(1,), (2,), (3,), (3,), (2,), (3,), (4,), (5,), (6,)]

    new_domain_kwargs = engine.add_column_row_condition(
        domain_kwargs, filter_null=True, filter_nan=False
    )
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(
            condition="foo IS NOT NULL",
            condition_type=RowConditionParserType.SPARK_SQL,
        ),
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs, domain_type="table")
    res = df.collect()
    expected = [(1,), (2,), (3,), (3,), (np.nan,), (2,), (3,), (4,), (5,), (6,)]
    # since nan != nan by default, compare with equal_nan=True
    assert np.allclose(res, expected, rtol=0, atol=0, equal_nan=True)