Example #1
0
def test_get_domain_records_with_column_domain_and_filter_conditions_raises_error_on_multiple_conditions(
    sa,
):
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2, 3, 4, 5, None],
        "c": [1, 2, 3, 4, None]
    })
    engine = build_sa_engine(df, sa)
    with pytest.raises(ge_exceptions.GreatExpectationsError) as e:
        data = engine.get_domain_records(
            domain_kwargs={
                "column": "a",
                "row_condition": 'col("a")<2',
                "condition_parser": "great_expectations__experimental__",
                "filter_conditions": [
                    RowCondition(
                        condition='col("b").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                    RowCondition(
                        condition='col("c").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                ],
            })
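The expected error above reflects that the SqlAlchemy execution engine accepts at most one filter condition at a time. A runnable mini-sketch of that guard (an assumption inferred from this test, not the library's exact code; the local exception class stands in for ge_exceptions.GreatExpectationsError):

class GreatExpectationsError(Exception):
    """Local stand-in for ge_exceptions.GreatExpectationsError."""

def check_filter_conditions(filter_conditions):
    # Assumed guard: more than one filter condition is rejected.
    if len(filter_conditions) > 1:
        raise GreatExpectationsError(
            "SqlAlchemyExecutionEngine supports a single filter condition."
        )

check_filter_conditions(['col("b").notnull()'])  # one condition: accepted
# check_filter_conditions(['col("b").notnull()', 'col("c").notnull()'])  # raises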
Example #2
0
def test_add_column_row_condition_filter_null_row_condition_not_present(
    test_execution_engine,
):
    e = test_execution_engine

    # Checking that adding a simple column row condition is functional
    # default of add_column_row_condition is to apply filter_null=True
    domain_kwargs: dict = {}
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs, "a")
    assert new_domain_kwargs == {
        "filter_conditions": [
            RowCondition(condition='col("a").notnull()',
                         condition_type=RowConditionParserType.GE)
        ]
    }

    # Ensuring that this also works when formatted differently
    # default of add_column_row_condition is to apply filter_null=True
    new_domain_kwargs = e.add_column_row_condition({"column": "a"})
    assert new_domain_kwargs == {
        "column":
        "a",
        "filter_conditions": [
            RowCondition(condition='col("a").notnull()',
                         condition_type=RowConditionParserType.GE)
        ],
    }
Example #3
0
def test_get_domain_records_with_different_column_domain_and_filter_conditions(
        spark_session, basic_spark_df_execution_engine,
        spark_df_from_pandas_df):
    pd_df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2, 3, 4, 5, None],
        "c": [1, 2, 3, 4, None]
    })
    df = spark_df_from_pandas_df(spark_session, pd_df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="1234", batch_data=df)
    data = engine.get_domain_records(
        domain_kwargs={
            "column": "a",
            "row_condition": 'col("a")<2',
            "condition_parser": "great_expectations__experimental__",
            "filter_conditions": [
                RowCondition(
                    condition="b IS NOT NULL",
                    condition_type=RowConditionParserType.SPARK_SQL,
                )
            ],
        })

    expected_column_pd_df = pd_df.iloc[:1]
    expected_column_df = spark_df_from_pandas_df(spark_session,
                                                 expected_column_pd_df)

    assert dataframes_equal(
        data, expected_column_df
    ), "Data does not match after getting full access compute domain"
Example #4
0
    def _combine_row_conditions(
        self, row_conditions: List[RowCondition]
    ) -> RowCondition:
        """Combine row conditions using AND if condition_type is SPARK_SQL

        Note, although this method does not currently use `self` internally we
        are not marking as @staticmethod since it is meant to only be called
        internally in this class.

        Args:
            row_conditions: Row conditions of type Spark

        Returns:
            Single Row Condition combined
        """
        assert all(
            condition.condition_type == RowConditionParserType.SPARK_SQL
            for condition in row_conditions
        ), "All row conditions must have type SPARK_SQL"
        conditions: List[str] = [
            row_condition.condition for row_condition in row_conditions
        ]
        joined_condition: str = " AND ".join(conditions)
        return RowCondition(
            condition=joined_condition, condition_type=RowConditionParserType.SPARK_SQL
        )
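For reference, a usage sketch of _combine_row_conditions, with minimal stand-in definitions of RowCondition and RowConditionParserType inferred from how they are used throughout these examples (the real classes ship with Great Expectations; the enum values shown here are assumptions):

from dataclasses import dataclass
from enum import Enum
from typing import List


class RowConditionParserType(Enum):
    GE = "ge"                # condition uses the GE dialect, e.g. 'col("a")<2'
    SPARK_SQL = "spark_sql"  # condition is a raw Spark SQL expression


@dataclass
class RowCondition:
    condition: str
    condition_type: RowConditionParserType


conditions: List[RowCondition] = [
    RowCondition("b IS NOT NULL", RowConditionParserType.SPARK_SQL),
    RowCondition("NOT isnan(c)", RowConditionParserType.SPARK_SQL),
]
# _combine_row_conditions simply joins the raw condition strings with " AND ":
combined = " AND ".join(c.condition for c in conditions)
assert combined == "b IS NOT NULL AND NOT isnan(c)"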
Example #5
0
def test_get_domain_records_with_different_column_domain_and_filter_conditions(
        sa):
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2, 3, 4, 5, None],
        "c": [1, 2, 3, 4, None]
    })
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column": "a",
            "row_condition": 'col("a")<2',
            "condition_parser": "great_expectations__experimental__",
            "filter_conditions": [
                RowCondition(
                    condition='col("b").notnull()',
                    condition_type=RowConditionParserType.GE,
                )
            ],
        })
    domain_data = engine.engine.execute(
        get_sqlalchemy_domain_data(data)).fetchall()

    expected_column_df = df.iloc[:1]
    engine = build_sa_engine(expected_column_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(
            engine.active_batch_data.selectable)).fetchall()

    assert (domain_data == expected_data
            ), "Data does not match after getting full access compute domain"
Example #6
0
def test_add_column_row_condition_filter_null_row_condition_present(
    test_execution_engine,
):
    e = test_execution_engine

    # Ensuring that we don't override if a row condition is present
    # default of add_column_row_condition is to apply filter_null=True
    domain_kwargs: dict = {
        "column": "a",
        "row_condition": "some_row_condition"
    }
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs,
                                                   filter_null=True)
    assert new_domain_kwargs == {
        "column":
        "a",
        "row_condition":
        "some_row_condition",
        "filter_conditions": [
            RowCondition(condition='col("a").notnull()',
                         condition_type=RowConditionParserType.GE)
        ],
    }

    # Ensuring that we don't override if a row condition is present, this time
    # relying on the default of add_column_row_condition (filter_null=True)
    domain_kwargs: dict = {
        "column": "a",
        "row_condition": "some_row_condition"
    }
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs)
    assert new_domain_kwargs == {
        "column":
        "a",
        "row_condition":
        "some_row_condition",
        "filter_conditions": [
            RowCondition(condition='col("a").notnull()',
                         condition_type=RowConditionParserType.GE)
        ],
    }
Example #7
0
    def add_column_row_condition(self,
                                 domain_kwargs,
                                 column_name=None,
                                 filter_null=True,
                                 filter_nan=False):
        # We explicitly handle filter_nan & filter_null for spark using a spark-native condition

        new_domain_kwargs = copy.deepcopy(domain_kwargs)
        assert "column" in domain_kwargs or column_name is not None
        if column_name is not None:
            column = column_name
        else:
            column = domain_kwargs["column"]

        filter_conditions: List[RowCondition] = []
        if filter_null:
            filter_conditions.append(
                RowCondition(
                    condition=f"{column} IS NOT NULL",
                    condition_type=RowConditionParserType.SPARK_SQL,
                ))
        if filter_nan:
            filter_conditions.append(
                RowCondition(
                    condition=f"NOT isnan({column})",
                    condition_type=RowConditionParserType.SPARK_SQL,
                ))

        if not (filter_null or filter_nan):
            logger.warning(
                "add_column_row_condition called without specifying a desired row condition"
            )

        new_domain_kwargs.setdefault("filter_conditions",
                                     []).extend(filter_conditions)

        return new_domain_kwargs
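A quick illustration of what this Spark override returns when both filters are requested (engine is assumed to be a Spark execution engine exposing the method above; the same behavior is asserted in test_add_column_row_condition further below):

kwargs = engine.add_column_row_condition(
    {"column": "foo"}, filter_null=True, filter_nan=True
)
assert kwargs["filter_conditions"] == [
    RowCondition(condition="foo IS NOT NULL",
                 condition_type=RowConditionParserType.SPARK_SQL),
    RowCondition(condition="NOT isnan(foo)",
                 condition_type=RowConditionParserType.SPARK_SQL),
]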
Example #8
0
def test_add_column_row_condition_filter_null_row_condition_none(
        test_execution_engine):
    e = test_execution_engine

    # Ensuring that everything still works if a row condition of None is given
    # default of add_column_row_condition is to apply filter_null=True
    domain_kwargs: dict = {"column": "a", "row_condition": None}
    new_domain_kwargs = e.add_column_row_condition(domain_kwargs)
    assert new_domain_kwargs == {
        "column":
        "a",
        "row_condition":
        None,
        "filter_conditions": [
            RowCondition(condition='col("a").notnull()',
                         condition_type=RowConditionParserType.GE)
        ],
    }
Example #9
0
    def add_column_row_condition(self,
                                 domain_kwargs,
                                 column_name=None,
                                 filter_null=True,
                                 filter_nan=False):
        """EXPERIMENTAL

        Add a row condition for handling null filter.

        Args:
            domain_kwargs: the domain kwargs to use as the base and to which to add the condition
            column_name: if provided, use this name to add the condition; otherwise, the "column" key from domain_kwargs is used
            filter_null: if true, add a filter for null values
            filter_nan: if true, add a filter for nan values
        """
        if filter_null is False and filter_nan is False:
            logger.warning(
                "add_column_row_condition called with no filter condition requested"
            )
            return domain_kwargs

        if filter_nan:
            raise ge_exceptions.GreatExpectationsError(
                "Base ExecutionEngine does not support adding nan condition filters"
            )

        new_domain_kwargs = copy.deepcopy(domain_kwargs)
        assert (
            "column" in domain_kwargs or column_name is not None
        ), "No column provided: A column must be provided in domain_kwargs or in the column_name parameter"
        if column_name is not None:
            column = column_name
        else:
            column = domain_kwargs["column"]
        row_condition: RowCondition = RowCondition(
            condition=f'col("{column}").notnull()',
            condition_type=RowConditionParserType.GE,
        )
        new_domain_kwargs.setdefault("filter_conditions",
                                     []).append(row_condition)
        return new_domain_kwargs
Example #10
0
def test_add_column_row_condition(spark_session,
                                  basic_spark_df_execution_engine):
    df = pd.DataFrame({"foo": [1, 2, 3, 3, None, 2, 3, 4, 5, 6]})
    df = spark_session.createDataFrame(
        [
            tuple(None if isinstance(x, (float, int)) and np.isnan(x) else x
                  for x in record.tolist())
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="1234", batch_data=df)
    domain_kwargs = {"column": "foo"}

    new_domain_kwargs = engine.add_column_row_condition(domain_kwargs,
                                                        filter_null=True,
                                                        filter_nan=False)
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(condition="foo IS NOT NULL",
                     condition_type=RowConditionParserType.SPARK_SQL)
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs,
                                           domain_type="table")
    res = df.collect()
    assert res == [(1, ), (2, ), (3, ), (3, ), (2, ), (3, ), (4, ), (5, ),
                   (6, )]

    new_domain_kwargs = engine.add_column_row_condition(domain_kwargs,
                                                        filter_null=True,
                                                        filter_nan=True)
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(condition="foo IS NOT NULL",
                     condition_type=RowConditionParserType.SPARK_SQL),
        RowCondition(condition="NOT isnan(foo)",
                     condition_type=RowConditionParserType.SPARK_SQL),
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs,
                                           domain_type="table")
    res = df.collect()
    assert res == [(1, ), (2, ), (3, ), (3, ), (2, ), (3, ), (4, ), (5, ),
                   (6, )]

    new_domain_kwargs = engine.add_column_row_condition(domain_kwargs,
                                                        filter_null=False,
                                                        filter_nan=True)
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(condition="NOT isnan(foo)",
                     condition_type=RowConditionParserType.SPARK_SQL)
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs,
                                           domain_type="table")
    res = df.collect()
    assert res == [(1, ), (2, ), (3, ), (3, ), (None, ), (2, ), (3, ), (4, ),
                   (5, ), (6, )]

    # This time, our skip value *will* be nan
    df = pd.DataFrame({"foo": [1, 2, 3, 3, None, 2, 3, 4, 5, 6]})
    df = spark_session.createDataFrame(df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="1234", batch_data=df)

    new_domain_kwargs = engine.add_column_row_condition(domain_kwargs,
                                                        filter_null=False,
                                                        filter_nan=True)
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(condition="NOT isnan(foo)",
                     condition_type=RowConditionParserType.SPARK_SQL)
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs,
                                           domain_type="table")
    res = df.collect()
    assert res == [(1, ), (2, ), (3, ), (3, ), (2, ), (3, ), (4, ), (5, ),
                   (6, )]

    new_domain_kwargs = engine.add_column_row_condition(domain_kwargs,
                                                        filter_null=True,
                                                        filter_nan=False)
    assert new_domain_kwargs["filter_conditions"] == [
        RowCondition(condition="foo IS NOT NULL",
                     condition_type=RowConditionParserType.SPARK_SQL),
    ]
    df, cd, ad = engine.get_compute_domain(new_domain_kwargs,
                                           domain_type="table")
    res = df.collect()
    expected = [(1, ), (2, ), (3, ), (3, ), (np.nan, ), (2, ), (3, ), (4, ),
                (5, ), (6, )]
    # since nan != nan by default
    assert np.allclose(res, expected, rtol=0, atol=0, equal_nan=True)
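
The equal_nan=True flag matters here because spark_session.createDataFrame(df) keeps NaN in the float column (it is not converted to null), so the "foo IS NOT NULL" filter retains the NaN row; a plain equality check would then fail since NaN never compares equal to itself:

import numpy as np

assert float("nan") != float("nan")  # NaN is not equal to itself
assert np.allclose([1.0, float("nan")], [1.0, float("nan")],
                   rtol=0, atol=0, equal_nan=True)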