Ejemplo n.º 1
0
def test_validate_dataset(dataset, basic_expectation_suite):
    """Run ge.validate on each backend and check wrong data_asset_class is rejected."""
    mismatch = (r"The validate util method only supports validation for "
                r"subtypes of the provided data_asset_type")

    outcome = ge.validate(dataset, basic_expectation_suite)
    assert outcome["success"] is True
    assert outcome["statistics"]["evaluated_expectations"] == 4

    if isinstance(dataset, ge.dataset.PandasDataset):
        outcome = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.PandasDataset,
        )
        assert outcome["success"] is True
        assert outcome["statistics"]["evaluated_expectations"] == 4
        # A SqlAlchemy class cannot validate a pandas dataset.
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(dataset, basic_expectation_suite,
                        data_asset_class=ge.dataset.SqlAlchemyDataset)

    elif isinstance(dataset, ge.dataset.SqlAlchemyDataset):
        outcome = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.SqlAlchemyDataset,
        )
        assert outcome["success"] is True
        assert outcome["statistics"]["evaluated_expectations"] == 4
        # A pandas class cannot validate a SqlAlchemy dataset.
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(dataset,
                        expectation_suite=basic_expectation_suite,
                        data_asset_class=ge.dataset.PandasDataset)

    elif isinstance(dataset, ge.dataset.SparkDFDataset):
        outcome = ge.validate(dataset, basic_expectation_suite,
                              data_asset_class=ge.dataset.SparkDFDataset)
        assert outcome["success"] is True
        assert outcome["statistics"]["evaluated_expectations"] == 4
        # A pandas class cannot validate a Spark dataset.
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(dataset,
                        expectation_suite=basic_expectation_suite,
                        data_asset_class=ge.dataset.PandasDataset)
Ejemplo n.º 2
0
def test_validate_invalid_parameters(dataset, basic_expectation_suite,
                                     data_context):
    """ge.validate must refuse to run with neither a suite nor a context."""
    expected = ("Either an expectation suite or a DataContext is required "
                "for validation.")
    with pytest.raises(ValueError, match=expected):
        ge.validate(dataset)
Ejemplo n.º 3
0
def test_validate_non_dataset(file_data_asset, empty_expectation_suite):
    """Non-dataset assets (FileDataAsset) are rejected by ge.validate."""
    dataset_only = r"The validate util method only supports dataset validations"
    with pytest.raises(ValueError, match=dataset_only):
        ge.validate(
            file_data_asset,
            empty_expectation_suite,
            data_asset_type=ge.data_asset.FileDataAsset,
        )
Ejemplo n.º 4
0
def test_validate_non_dataset(file_data_asset, empty_expectation_suite):
    """A FileDataAsset is rejected even while the missing-version warning fires."""
    dataset_only = r"The validate util method only supports dataset validations"
    no_version = "No great_expectations version found in configuration object."
    with pytest.raises(ValueError, match=dataset_only):
        with pytest.warns(Warning, match=no_version):
            ge.validate(file_data_asset,
                        empty_expectation_suite,
                        data_asset_class=ge.data_asset.FileDataAsset)
Ejemplo n.º 5
0
def test_validate_using_data_context(
    dataset, data_context_parameterized_expectation_suite
):
    """Validate via a DataContext; evaluation-parameter deps must stay uncompiled."""
    context = data_context_parameterized_expectation_suite

    # Before running, the data context should not have compiled parameters.
    assert context._evaluation_parameter_dependencies_compiled is False

    with pytest.warns(
        Warning, match=r"This configuration object was built using version"
    ):
        outcome = ge.validate(
            dataset,
            expectation_suite_name="my_dag_node.default",
            data_context=context,
        )

    # Evaluation parameters are no longer handled without an action, so the
    # context should still be uncompiled after validation.
    assert context._evaluation_parameter_dependencies_compiled is False

    # The context-provided config supplies exactly two expectations.
    assert outcome.success is False
    assert outcome.statistics["evaluated_expectations"] == 2
Ejemplo n.º 6
0
def test_validate_using_data_context_path(dataset, data_context):
    """Passing the context root directory (a path) should locate the named suite."""
    outcome = ge.validate(
        dataset,
        expectation_suite_name="my_dag_node.default",
        data_context=data_context.root_directory,
    )

    # The right suite was found and its two expectations evaluated.
    assert outcome.success is False
    assert outcome["statistics"]["evaluated_expectations"] == 2
Ejemplo n.º 7
0
def test_validate_using_data_context_path(dataset, data_context):
    """Lookup by data_asset_name against a context given as a filesystem path."""
    outcome = ge.validate(
        dataset,
        data_asset_name="mydatasource/mygenerator/my_dag_node",
        data_context=data_context.root_directory,
    )

    # The right suite was found and its two expectations evaluated.
    assert outcome["success"] is False
    assert outcome["statistics"]["evaluated_expectations"] == 2
Ejemplo n.º 8
0
def test_validate_using_data_context_path(dataset, data_context):
    """Lookup by data_asset_name for a parameterized suite, via a path context."""
    data_context_path = data_context.root_directory
    res = ge.validate(
        dataset,
        data_asset_name="parameterized_expectation_suite_fixture",
        data_context=data_context_path)

    # We should have now found the right suite with expectations to evaluate.
    # `is False` (not `== False`) follows PEP 8 (E712) and matches the
    # identity comparison every sibling test in this file uses.
    assert res["success"] is False
    assert res["statistics"]["evaluated_expectations"] == 2
Ejemplo n.º 9
0
def test_validate_using_data_context(dataset, data_context):
    """Validating through a live DataContext should trigger compilation."""
    # Before running, the data context should not have compiled parameters.
    assert data_context._compiled is False

    outcome = ge.validate(
        dataset,
        data_asset_name="mydatasource/mygenerator/my_dag_node",
        data_context=data_context,
    )

    # Handling the validation-result registration compiles the context.
    assert data_context._compiled is True

    # The context-provided config supplies exactly two expectations.
    assert outcome["success"] is False
    assert outcome["statistics"]["evaluated_expectations"] == 2
Ejemplo n.º 10
0
def test_validate_using_data_context(dataset, data_context):
    """Suite-name validation must not compile evaluation-parameter deps."""
    # Before running, the data context should not have compiled parameters.
    assert data_context._evaluation_parameter_dependencies_compiled is False

    outcome = ge.validate(
        dataset,
        expectation_suite_name="my_dag_node.default",
        data_context=data_context,
    )

    # Evaluation parameters are no longer handled without an action, so the
    # context stays uncompiled after validation.
    assert data_context._evaluation_parameter_dependencies_compiled is False

    # Two expectations come from the context-provided configuration.
    assert outcome.success is False
    assert outcome.statistics["evaluated_expectations"] == 2
def test_validate_using_data_context_path(
        dataset, data_context_parameterized_expectation_suite):
    """A context given by filesystem path still resolves the named suite."""
    context_root = data_context_parameterized_expectation_suite.root_directory
    version_warning = r"This configuration object was built using version"

    with pytest.warns(Warning, match=version_warning):
        outcome = ge.validate(
            dataset,
            expectation_suite_name="my_dag_node.default",
            data_context=context_root,
        )

    # The right suite was found and its two expectations evaluated.
    assert outcome.success is False
    assert outcome["statistics"]["evaluated_expectations"] == 2
Ejemplo n.º 12
0
    def validate(self, df: pd.DataFrame) -> "GEValidationReport":
        """Run the configured GE expectation suite against *df*.

        Steps:
        1. Wrap the pandas dataframe in a GE ``PandasDataset``.
        2. Apply ``_prepare_dataset`` fixes so GE does not crash on the data.
        3. Evaluate every expectation in the suite with COMPLETE result format.

        Returns a ``GEValidationReport``, which converts GE's result schema
        into a list of generic ValidationErrors.
        """
        prepared = _prepare_dataset(PandasDataset(df))

        raw_results = ge.validate(
            prepared,
            expectation_suite=self.expectation_suite,
            result_format="COMPLETE",
        )
        return GEValidationReport(raw_results)
def __main__():
    """Entry point: validate the file named on argv[1], then print model params.

    Exit codes: -2 when no filepath argument is given, -1 when GE validation
    fails; otherwise the computed parameters are printed as JSON.
    """
    run_id = str(uuid.uuid1())
    if len(sys.argv) <= 1:
        print("Please specify a filepath to process.")
        sys.exit(-2)

    df = load_data(sys.argv[1])

    validation_result = ge.validate(
        df,
        data_context=ge.data_context.DataContext('../'),
        data_asset_name="notable_works_by_charles_dickens",
        run_id=run_id)

    # `is False` instead of `== False` (PEP 8 E712); behavior is unchanged
    # because GE reports success as a bool.
    if validation_result["success"] is False:
        print("Validation error for run {0:s}".format(str(run_id)))
        sys.exit(-1)

    df = add_columns(df)
    params = compute_model_parameters(df)

    # BUG FIX: the original line had the whole `.format(run_id)` call inside
    # the string literal, so it printed the template verbatim instead of the
    # run id. Interpolate it properly.
    print("processed run {run_id}".format(run_id=run_id))
    print(json.dumps(params, indent=2))
Ejemplo n.º 14
0
def test_validate_dataset(dataset, basic_expectation_suite):
    """Run ge.validate per backend and verify wrong data_asset_class is rejected.

    The four identical `pytest.raises` stanzas of the original are factored
    into one local helper, and the two SqlAlchemy branches (mysql vs. other
    dialects) are merged — they differ only in the expected success flag.
    """
    _TYPE_MISMATCH = (
        r"The validate util method only supports validation for subtypes "
        r"of the provided data_asset_type"
    )

    def _expect_wrong_class_rejected(wrong_class):
        # Validating with a data_asset_class the dataset is not a subtype of
        # must raise, regardless of backend.
        with pytest.raises(ValueError, match=_TYPE_MISMATCH):
            ge.validate(
                dataset,
                expectation_suite=basic_expectation_suite,
                data_asset_class=wrong_class,
            )

    res = ge.validate(dataset, basic_expectation_suite)
    # res.success is not asserted here: it is False on mysql, where the
    # "infinities" column is missing.
    assert res["statistics"]["evaluated_expectations"] == 4

    if isinstance(dataset, ge.dataset.PandasDataset):
        res = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.PandasDataset,
        )
        assert res.success is True
        assert res["statistics"]["evaluated_expectations"] == 4
        _expect_wrong_class_rejected(ge.dataset.SqlAlchemyDataset)

    elif isinstance(dataset, ge.dataset.SqlAlchemyDataset):
        # mysql cannot use the infinities column, so overall success fails
        # there while every other dialect passes.
        is_mysql = dataset.sql_engine_dialect.name.lower() == "mysql"
        res = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.SqlAlchemyDataset,
        )
        assert res.success is (not is_mysql)
        assert res["statistics"]["evaluated_expectations"] == 4
        _expect_wrong_class_rejected(ge.dataset.PandasDataset)

    elif isinstance(dataset, ge.dataset.SparkDFDataset):
        res = ge.validate(
            dataset,
            basic_expectation_suite,
            data_asset_class=ge.dataset.SparkDFDataset,
        )
        assert res.success is True
        assert res["statistics"]["evaluated_expectations"] == 4
        _expect_wrong_class_rejected(ge.dataset.PandasDataset)
Ejemplo n.º 15
0
 def test_top_level_validate(self):
     """Validate a 5-row dataframe against an inline suite dict and compare
     the full result payload literally.

     The suite has two expectations: column "x" exists (passes) and its
     values lie in [3, 5] (fails for values 1 and 2), so the overall result
     is success=False with 1 of 2 expectations successful.
     """
     my_df = pd.DataFrame({"x": [1, 2, 3, 4, 5]})
     validation_result = ge.validate(
         my_df, {
             "dataset_name":
             None,
             "meta": {
                 "great_expectations.__version__": ge.__version__
             },
             "expectations": [{
                 "expectation_type": "expect_column_to_exist",
                 "kwargs": {
                     "column": "x"
                 }
             }, {
                 "expectation_type": "expect_column_values_to_be_between",
                 "kwargs": {
                     "column": "x",
                     "min_value": 3,
                     "max_value": 5
                 }
             }]
         })
     # Exact-literal comparison: any change to GE's result schema (keys,
     # statistics, result details) will fail this assertion.
     self.assertEqual(
         validation_result, {
             "results": [{
                 "expectation_config": {
                     "kwargs": {
                         "column": "x"
                     },
                     "expectation_type": "expect_column_to_exist",
                 },
                 "exception_info": {
                     "exception_message": None,
                     "exception_traceback": None,
                     "raised_exception": False
                 },
                 "success": True
             }, {
                 "expectation_config": {
                     "expectation_type":
                     "expect_column_values_to_be_between",
                     "kwargs": {
                         "column": "x",
                         "max_value": 5,
                         "min_value": 3
                     }
                 },
                 "exception_info": {
                     "exception_message": None,
                     "exception_traceback": None,
                     "raised_exception": False
                 },
                 "success": False,
                 # values 1 and 2 fall outside [3, 5]: 2 of 5 rows unexpected.
                 "result": {
                     'element_count': 5,
                     'missing_count': 0,
                     'missing_percent': 0.0,
                     "unexpected_percent": 0.4,
                     "partial_unexpected_list": [1, 2],
                     "unexpected_percent_nonmissing": 0.4,
                     "unexpected_count": 2
                 }
             }],
             "success":
             False,
             "statistics": {
                 "evaluated_expectations": 2,
                 "successful_expectations": 1,
                 "unsuccessful_expectations": 1,
                 "success_percent": 50,
             }
         })