# Example 1
def test_ValidationsStore_with_FixedLengthTupleS3StoreBackend():
    """Round-trip validation results through a ValidationsStore backed by a
    FixedLengthTupleS3StoreBackend and verify the S3 keys it writes.

    NOTE(review): assumes Moto's mock AWS environment is active (e.g. a
    @mock_s3 decorator/fixture not visible in this chunk) -- confirm at the
    call site before running against real credentials.
    """
    bucket = "test_validation_store_bucket"
    prefix = "test/prefix"

    # create a bucket in Moto's mock AWS environment
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=bucket)

    # First, demonstrate that we pick up default configuration including from an S3FixedLengthTupleS3StoreBackend
    my_store = ValidationsStore(store_backend={
        "class_name": "FixedLengthTupleS3StoreBackend",
        "bucket": bucket,
        "prefix": prefix
    },
                                root_directory=None)

    # Only ValidationResultIdentifier keys are accepted by the store.
    with pytest.raises(TypeError):
        my_store.get("not_a_ValidationResultIdentifier")

    # An identifier built from an empty dict is missing its required keys.
    with pytest.raises(MissingTopLevelConfigKeyError):
        my_store.get(ValidationResultIdentifier(**{}))

    ns_1 = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            data_asset_name=DataAssetIdentifier(datasource="a",
                                                generator="b",
                                                generator_asset="c"),
            expectation_suite_name="quarantine",
        ),
        run_id="20191007T151224.1234Z_prod_100")
    my_store.set(ns_1, {"A": "aaa"})
    assert my_store.get(ns_1) == {"A": "aaa"}

    ns_2 = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            data_asset_name=DataAssetIdentifier(datasource="a",
                                                generator="b",
                                                generator_asset="c"),
            expectation_suite_name="quarantine",
        ),
        run_id="20191007T151224.1234Z_prod_200")

    my_store.set(ns_2, "bbb")
    assert my_store.get(ns_2) == "bbb"

    # Verify that internals are working as expected, including the default
    # filepath.  (Idiom fix: a set comprehension replaces set([listcomp]).)
    assert {
        s3_object_info['Key']
        for s3_object_info in boto3.client('s3').list_objects(
            Bucket=bucket, Prefix=prefix)['Contents']
    } == {
        'test/prefix/20191007T151224.1234Z_prod_100/a/b/c/quarantine.json',
        'test/prefix/20191007T151224.1234Z_prod_200/a/b/c/quarantine.json'
    }

    print(my_store.list_keys())
    assert set(my_store.list_keys()) == {
        ns_1,
        ns_2,
    }
# Example 2
def test_OrderedKeysDotDict__recursively_get_key_length():
    """_recursively_get_key_length on a locally declared OrderedKeysDotDict
    subclass and on two project identifier classes.

    The flat sample class reports one per key; ValidationResultIdentifier
    reporting 5 (vs. DataAssetIdentifier's 3) suggests nested identifiers
    are counted recursively -- as the method name implies.
    """

    class SampleOKDD(OrderedKeysDotDict):
        _key_order = ["A", "B", "C"]
        _key_types = {
            "A": string_types,
            "B": int,
        }
        _allowed_keys = set(_key_order)
        _required_keys = set(_key_order)

    assert SampleOKDD._recursively_get_key_length() == 3
    assert DataAssetIdentifier._recursively_get_key_length() == 3
    assert ValidationResultIdentifier._recursively_get_key_length() == 5
def test_compile(data_context):
    """_compile collects every urn-style parameter from the configured
    expectation suites: once as a raw set, and once nested under the
    owning data asset / generator / expectation type.
    """
    # Name the two urns once so the raw set and the nested structure are
    # visibly built from the same strings.
    diabetes_urn = (
        'urn:great_expectations:validations:mydatasource/mygenerator/'
        'source_diabetes_data:default:expectations:'
        'expect_column_unique_value_count_to_be_between:columns:patient_nbr:'
        'result:observed_value')
    patient_urn = (
        'urn:great_expectations:validations:mydatasource/mygenerator/'
        'source_patient_data:default:expectations:'
        'expect_table_row_count_to_equal:result:observed_value')

    diabetes_asset = DataAssetIdentifier(
        datasource='mydatasource',
        generator='mygenerator',
        generator_asset='source_diabetes_data')
    patient_asset = DataAssetIdentifier(
        datasource='mydatasource',
        generator='mygenerator',
        generator_asset='source_patient_data')

    data_context._compile()
    assert data_context._compiled_parameters == {
        'raw': {diabetes_urn, patient_urn},
        'data_assets': {
            diabetes_asset: {
                'default': {
                    'expect_column_unique_value_count_to_be_between': {
                        'columns': {
                            'patient_nbr': {
                                'result': {diabetes_urn}
                            }
                        }
                    }
                }
            },
            patient_asset: {
                'default': {
                    'expect_table_row_count_to_equal': {
                        'result': {patient_urn}
                    }
                }
            }
        }
    }
# Example 4
def test_HtmlSiteStore_S3_backend():
    """HtmlSiteStore with a FixedLengthTupleS3StoreBackend writes rendered
    pages -- and the special index page -- to the expected S3 keys.

    NOTE(review): assumes Moto's mock AWS environment is active (e.g. a
    @mock_s3 decorator/fixture not visible in this chunk) -- confirm at the
    call site before running against real credentials.
    """
    bucket = "test_validation_store_bucket"
    prefix = "test/prefix"

    # create a bucket in Moto's mock AWS environment
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=bucket)

    my_store = HtmlSiteStore(root_directory='NOT_USED_WITH_S3',
                             store_backend={
                                 "class_name":
                                 "FixedLengthTupleS3StoreBackend",
                                 "bucket": bucket,
                                 "prefix": prefix
                             })

    # Only proper identifier types are accepted as keys.
    with pytest.raises(TypeError):
        my_store.get("not_a_ValidationResultIdentifier")

    with pytest.raises(MissingTopLevelConfigKeyError):
        my_store.get(ValidationResultIdentifier(**{}))

    ns_1 = SiteSectionIdentifier(
        site_section_name="validations",
        resource_identifier=ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                data_asset_name=DataAssetIdentifier(datasource="a",
                                                    generator="b",
                                                    generator_asset="c"),
                expectation_suite_name="quarantine",
            ),
            run_id="20191007T151224.1234Z_prod_100"))
    my_store.set(ns_1, "aaa")

    ns_2 = SiteSectionIdentifier(
        site_section_name="expectations",
        resource_identifier=ExpectationSuiteIdentifier(
            data_asset_name=DataAssetIdentifier(datasource="a",
                                                generator="b",
                                                generator_asset="c"),
            expectation_suite_name="quarantine",
        ))
    my_store.set(ns_2, "bbb")

    assert set(my_store.list_keys()) == {
        ns_1,
        ns_2,
    }

    # This is a special un-store-like method exposed by the HtmlSiteStore
    my_store.write_index_page("index_html_string_content")

    # Verify that internals are working as expected, including the default
    # filepath.  Reuse a single client for both S3 reads (the original built
    # two); build the key set with a set comprehension rather than set([...]).
    s3_client = boto3.client('s3')
    assert {
        s3_object_info['Key']
        for s3_object_info in s3_client.list_objects(
            Bucket=bucket, Prefix=prefix)['Contents']
    } == {
        'test/prefix/index.html',
        'test/prefix/expectations/a/b/c/quarantine.html',
        'test/prefix/validations/20191007T151224.1234Z_prod_100/a/b/c/quarantine.html'
    }

    index_content = s3_client.get_object(
        Bucket=bucket,
        Key='test/prefix/index.html')["Body"].read().decode('utf-8')
    assert index_content == "index_html_string_content"
def test_data_context_updates_expectation_suite_names(data_context):
    """A data context should update the data_asset_name and
    expectation_suite_name of expectation suites that it creates when it
    saves them.

    The suite is saved three ways and each round-trip is verified:
      1. Directly under a new name,
      2. Under a different name that should be overwritten (also checked
         on disk),
      3. With the context drawing the name from the suite itself.
    """
    expectation_suites = data_context.list_expectation_suite_keys()

    # We should have a single expectation suite defined
    assert len(expectation_suites) == 1

    data_asset_name = expectation_suites[0]['data_asset_name']
    expectation_suite_name = expectation_suites[0]['expectation_suite_name']

    # Construct each renamed identifier exactly once (the original rebuilt
    # the same DataAssetIdentifier three or four times per variant).
    new_asset = DataAssetIdentifier(data_asset_name.datasource,
                                    data_asset_name.generator,
                                    "a_new_data_asset")
    new_new_asset = DataAssetIdentifier(data_asset_name.datasource,
                                        data_asset_name.generator,
                                        "a_new_new_data_asset")
    third_asset = DataAssetIdentifier(data_asset_name.datasource,
                                      data_asset_name.generator,
                                      "a_third_name")

    # We'll get that expectation suite and then update its name and re-save, then verify that everything
    # has been properly updated
    expectation_suite = data_context.get_expectation_suite(
        data_asset_name=data_asset_name,
        expectation_suite_name=expectation_suite_name)

    # Note we codify here the current behavior of having a string data_asset_name though typed ExpectationSuite objects
    # will enable changing that
    assert expectation_suite['data_asset_name'] == str(data_asset_name)
    assert expectation_suite[
        'expectation_suite_name'] == expectation_suite_name

    #   1. Directly using the new name
    expectation_suite['data_asset_name'] = str(new_asset)
    expectation_suite['expectation_suite_name'] = 'a_new_suite_name'

    data_context.save_expectation_suite(
        expectation_suite=expectation_suite,
        data_asset_name=new_asset,
        expectation_suite_name='a_new_suite_name')

    fetched_expectation_suite = data_context.get_expectation_suite(
        data_asset_name=new_asset,
        expectation_suite_name='a_new_suite_name')

    assert fetched_expectation_suite['data_asset_name'] == str(new_asset)
    assert fetched_expectation_suite[
        'expectation_suite_name'] == 'a_new_suite_name'

    #   2. Using a different name that should be overwritten
    data_context.save_expectation_suite(
        expectation_suite=expectation_suite,
        data_asset_name=new_new_asset,
        expectation_suite_name='a_new_new_suite_name')

    fetched_expectation_suite = data_context.get_expectation_suite(
        data_asset_name=new_new_asset,
        expectation_suite_name='a_new_new_suite_name')

    assert fetched_expectation_suite['data_asset_name'] == str(new_new_asset)
    assert fetched_expectation_suite[
        'expectation_suite_name'] == 'a_new_new_suite_name'

    # Check that the saved name difference is actually persisted on disk
    with open(
            os.path.join(data_context.root_directory, "expectations",
                         data_asset_name.datasource, data_asset_name.generator,
                         "a_new_new_data_asset", "a_new_new_suite_name.json"),
            'r') as suite_file:
        loaded_suite = json.load(suite_file)
        assert loaded_suite['data_asset_name'] == str(new_new_asset)
        assert loaded_suite['expectation_suite_name'] == 'a_new_new_suite_name'

    #   3. Using the new name but having the context draw that from the suite
    expectation_suite['data_asset_name'] = str(third_asset)
    expectation_suite['expectation_suite_name'] = "a_third_suite_name"
    data_context.save_expectation_suite(expectation_suite=expectation_suite)

    fetched_expectation_suite = data_context.get_expectation_suite(
        data_asset_name=third_asset,
        expectation_suite_name="a_third_suite_name")
    assert fetched_expectation_suite['data_asset_name'] == str(third_asset)
    assert fetched_expectation_suite[
        'expectation_suite_name'] == "a_third_suite_name"
# Example 6
def test_errors_warnings_validation_operator_run_slack_query(
        basic_data_context_config_for_validation_operator, tmp_path_factory,
        filesystem_csv_4):
    """Run WarningAndFailureExpectationSuitesValidationOperator over three
    in-memory data assets (one of which fails its suite) and verify the
    Slack query payload the operator builds.
    """
    #####
    #####
    #
    # WARNING: PY2 SUPPORT IS UNTESTED BECAUSE OF DICTIONARY ORDER ISSUES NOT YET RESOLVED
    #
    #####
    #####
    if PY2:
        pytest.skip(
            "skipping test_errors_warnings_validation_operator_run_slack_query in py2"
        )

    project_path = str(tmp_path_factory.mktemp('great_expectations'))

    # NOTE: This setup is almost identical to test_DefaultDataContextAwareValidationOperator.
    # Consider converting to a single fixture.

    data_context = ConfigOnlyDataContext(
        basic_data_context_config_for_validation_operator,
        project_path,
    )

    data_context.add_datasource("my_datasource",
                                class_name="PandasDatasource",
                                base_directory=str(filesystem_csv_4))

    # Build a "failure" suite (x between 1 and 9) against f1.
    data_context.create_expectation_suite(
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="failure")
    df = data_context.get_batch("my_datasource/default/f1",
                                "failure",
                                batch_kwargs=data_context.yield_batch_kwargs(
                                    "my_datasource/default/f1"))
    df.expect_column_values_to_be_between(column="x", min_value=1, max_value=9)
    failure_expectations = df.get_expectation_suite(
        discard_failed_expectations=False)
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="failure")

    # Build a "warning" suite (adds a not-null check on y) against f1.
    data_context.create_expectation_suite(
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="warning")
    df = data_context.get_batch("my_datasource/default/f1",
                                "warning",
                                batch_kwargs=data_context.yield_batch_kwargs(
                                    "my_datasource/default/f1"))
    df.expect_column_values_to_be_between(column="x", min_value=1, max_value=9)
    df.expect_column_values_to_not_be_null(column="y")
    warning_expectations = df.get_expectation_suite(
        discard_failed_expectations=False)
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="warning")

    # Reuse both suites for f2 and f3 so all three assets are covered.
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f2",
        expectation_suite_name="failure")
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f3",
        expectation_suite_name="failure")
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f2",
        expectation_suite_name="warning")
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f3",
        expectation_suite_name="warning")

    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        slack_webhook="https://hooks.slack.com/services/test/slack/webhook")

    # f1 passes failure but has a null y (warning); f2 has x=99 (failure);
    # f3 passes everything.
    my_df_1 = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 4, None]})
    my_ge_df_1 = ge.from_pandas(my_df_1)
    my_ge_df_1._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f1")

    my_df_2 = pd.DataFrame({"x": [1, 2, 3, 4, 99], "y": [1, 2, 3, 4, 5]})
    my_ge_df_2 = ge.from_pandas(my_df_2)
    my_ge_df_2._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f2")

    my_df_3 = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 4, 5]})
    my_ge_df_3 = ge.from_pandas(my_df_3)
    my_ge_df_3._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f3")

    return_obj = vo.run(
        assets_to_validate=[my_ge_df_1, my_ge_df_2, my_ge_df_3],
        run_id="test_100")
    slack_query = vo._build_slack_query(return_obj)
    expected_slack_query = {
        'blocks': [{
            'type': 'divider'
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*FailureVsWarning Validation Operator Completed.*'
            }
        }, {
            'type': 'divider'
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Status*: Failed :x:'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Data Asset List:* [my_datasource/default/f1, my_datasource/default/f2, my_datasource/default/f3]'
            }
        }, {
            'type': 'section',
            'text': {
                # Review fix: removed a stray '' literal that was being
                # implicitly concatenated onto the 'type' key here.
                'type': 'mrkdwn',
                'text': '*Failed Data Assets:* [my_datasource/default/f2]'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Run ID:* test_100'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Timestamp:* 09/26/2019 13:42:41'
            }
        }, {
            'type': 'divider'
        }, {
            'type': 'context',
            'elements': [{
                'type': 'mrkdwn',
                'text': 'Learn about FailureVsWarning Validation Operators at https://docs.greatexpectations.io/en/latest/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.html'
            }]
        }]
    }

    # We're okay with system variation in locales (OS X likes 24 hour, but not Travis)
    for query in (slack_query, expected_slack_query):
        timestamp_text = query['blocks'][7]['text']['text']
        timestamp_text = timestamp_text.replace('09/26/2019 13:42:41',
                                                'LOCALEDATE')
        timestamp_text = timestamp_text.replace('09/26/2019 01:42:41 PM',
                                                'LOCALEDATE')
        query['blocks'][7]['text']['text'] = timestamp_text

    # json is imported at module level (see the on-disk persistence check in
    # test_data_context_updates_expectation_suite_names), so the redundant
    # function-local import was removed.
    print(json.dumps(slack_query, indent=2))
    print(json.dumps(expected_slack_query, indent=2))
    assert slack_query == expected_slack_query
# Example 7
def test_WarningAndFailureExpectationSuitesValidationOperator_with_file_structure(tmp_path_factory):
    base_path = str(tmp_path_factory.mktemp('test_DefaultDataContextAwareValidationOperator_with_file_structure__dir'))
    project_path = os.path.join( base_path, "project")
    print(os.getcwd())
    shutil.copytree(
        os.path.join( os.getcwd(), "tests/data_context/fixtures/post_init_project_v0.8.0_A" ),
        project_path,
    )
    print(gen_directory_tree_str(project_path))

    assert gen_directory_tree_str(project_path) == """\
project/
    data/
        bob-ross/
            README.md
            cluster-paintings.py
            elements-by-episode.csv
    great_expectations/
        .gitignore
        great_expectations.yml
        expectations/
            data__dir/
                default/
                    bob-ross/
                        BasicDatasetProfiler.json
                        failure.json
                        quarantine.json
                        warning.json
        notebooks/
            create_expectations.ipynb
            integrate_validation_into_pipeline.ipynb
        uncommitted/
            documentation/
                local_site/
                    index.html
                    expectations/
                        data__dir/
                            default/
                                bob-ross/
                                    BasicDatasetProfiler.html
                    profiling/
                        data__dir/
                            default/
                                bob-ross/
                                    BasicDatasetProfiler.html
                team_site/
                    index.html
                    expectations/
                        data__dir/
                            default/
                                bob-ross/
                                    BasicDatasetProfiler.html
            validations/
                profiling/
                    data__dir/
                        default/
                            bob-ross/
                                BasicDatasetProfiler.json
"""

    data_context = DataContext(
        context_root_dir=os.path.join(project_path, "great_expectations"),
    )

    my_df = pd.DataFrame({"x": [1,2,3,4,5]})

    data_asset_name = "data__dir/default/bob-ross"
    data_context.create_expectation_suite(data_asset_name=data_asset_name, expectation_suite_name="default")
    batch = data_context.get_batch(data_asset_name=data_asset_name, expectation_suite_name="default",
                                   batch_kwargs=data_context.yield_batch_kwargs(data_asset_name))

    validation_store_path = os.path.join(project_path, "great_expectations/uncommitted/validations")
    assert gen_directory_tree_str(validation_store_path) == """\
validations/
    profiling/
        data__dir/
            default/
                bob-ross/
                    BasicDatasetProfiler.json
"""

    data_asset_identifier = DataAssetIdentifier("data__dir", "default", "bob-ross")
    results = data_context.run_validation_operator(
        assets_to_validate=[batch],
        run_id="test-100",
        validation_operator_name="errors_and_warnings_validation_operator",
    )

    print(gen_directory_tree_str(validation_store_path))
    assert gen_directory_tree_str(validation_store_path) == """\