def test_BasicDatasetProfiler_null_column():
    """
    The profiler should determine that null columns are of null cardinality and of null type and
    not to generate expectations specific to types and cardinality categories.

    We verify this by running the basic profiler on a Pandas dataset with an empty column and
    asserting the number of successful results for the empty columns.
    """
    toy_dataset = PandasDataset({"x": [1, 2, 3], "y": [None, None, None]})
    # A fresh dataset starts with no expectations at all.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    def successful_result_count(column):
        # Number of successful validation results targeting `column`.
        # Hoisted into a helper: the original repeated this comprehension three times.
        return len(
            [
                result
                for result in evr_config["results"]
                if result.expectation_config["kwargs"].get("column") == column
                and result.success
            ]
        )

    # TODO: assert set - specific expectations
    assert successful_result_count("y") == 4
    # The all-null column should produce strictly fewer successes than a populated one.
    assert successful_result_count("y") < successful_result_count("x")
def test_BasicDatasetProfiler_with_context(filesystem_csv_data_context):
    """Profile a batch obtained via a DataContext and verify suite and EVR metadata."""
    context = filesystem_csv_data_context
    context.create_expectation_suite("default")

    datasource = context.datasources["rad_datasource"]
    generator_config = datasource.config["batch_kwargs_generators"]["subdir_reader"]
    batch_kwargs = {
        "datasource": "rad_datasource",
        "path": os.path.join(generator_config["base_directory"], "f1.csv"),
    }
    batch = context.get_batch(batch_kwargs, "default")

    expectation_suite, validation_results = BasicDatasetProfiler.profile(batch)

    assert expectation_suite.expectation_suite_name == "default"

    # The profiler must stamp its provenance metadata on the suite.
    assert "BasicDatasetProfiler" in expectation_suite.meta
    profiler_meta = expectation_suite.meta["BasicDatasetProfiler"]
    assert set(profiler_meta.keys()) == {
        "created_by",
        "created_at",
        "batch_kwargs",
    }
    assert profiler_meta["batch_kwargs"] == batch_kwargs

    # Every generated expectation carries profiler metadata with a confidence entry.
    for expectation in expectation_suite.expectations:
        assert "BasicDatasetProfiler" in expectation.meta
        assert "confidence" in expectation.meta["BasicDatasetProfiler"]

    assert set(validation_results.meta.keys()) == {
        "batch_kwargs",
        "batch_markers",
        "batch_parameters",
        "expectation_suite_name",
        "great_expectations.__version__",
        "run_id",
    }
def test_BasicDatasetProfiler_non_numeric_low_cardinality(
    non_numeric_low_card_dataset,
):
    """
    Unit test to check the expectations that BasicDatasetProfiler creates for a low
    cardinality non numeric column.

    The test is executed against all the backends (Pandas, Spark, etc.), because it
    uses the fixture.
    """
    expectations_config, evr_config = BasicDatasetProfiler.profile(
        non_numeric_low_card_dataset
    )

    # Expectations the profiler generated for the low-cardinality column.
    column_expectations = {
        expectation.expectation_type
        for expectation in expectations_config.expectations
        if expectation.kwargs.get("column") == "lowcardnonnum"
    }
    # Set literals instead of set([...]) wrappers (flake8-comprehensions C405/C403).
    assert column_expectations == {
        "expect_column_to_exist",
        "expect_column_values_to_be_in_type_list",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_distinct_values_to_be_in_set",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_set",
        "expect_column_values_to_not_match_regex",
    }
def test_BasicDatasetProfiler_numeric_high_cardinality(numeric_high_card_dataset):
    """
    Unit test to check the expectations that BasicDatasetProfiler creates for a high
    cardinality numeric column.

    The test is executed against all the backends (Pandas, Spark, etc.), because it
    uses the fixture.
    """
    expectations_config, evr_config = BasicDatasetProfiler.profile(
        numeric_high_card_dataset
    )

    # All expectation types the profiler generated for this single-column dataset.
    generated_expectations = {
        expectation.expectation_type
        for expectation in expectations_config.expectations
    }
    # Set literals instead of set([...]) wrappers (flake8-comprehensions C405/C403).
    assert generated_expectations == {
        "expect_column_to_exist",
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_type_list",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_set",
        "expect_column_values_to_be_unique",
    }
def test_BasicDatasetProfiler_null_column():
    """
    The profiler should determine that null columns are of null cardinality and of null type and
    not to generate expectations specific to types and cardinality categories.

    We verify this by running the basic profiler on a Pandas dataset with an empty column and
    asserting the number of successful results for the empty columns.
    """
    toy_dataset = PandasDataset(
        {"x": [1, 2, 3], "y": [None, None, None]}, data_asset_name="toy_dataset"
    )
    # A fresh dataset starts with no expectations at all.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True)["expectations"])
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    def successful_result_count(column):
        # Number of successful validation results targeting `column`.
        # Hoisted into a helper: the original repeated this comprehension three times.
        return len(
            [
                result
                for result in evr_config["results"]
                if result["expectation_config"]["kwargs"].get("column") == column
                and result["success"]
            ]
        )

    # TODO: assert set - specific expectations
    assert successful_result_count("y") == 4
    # The all-null column should produce strictly fewer successes than a populated one.
    assert successful_result_count("y") < successful_result_count("x")
def test_BasicDatasetProfiler_with_context(empty_data_context, filesystem_csv_2):
    """Profile a batch fetched through a pandas datasource and verify suite/EVR metadata."""
    empty_data_context.add_datasource(
        "my_datasource", "pandas", base_directory=str(filesystem_csv_2)
    )
    not_so_empty_data_context = empty_data_context

    batch = not_so_empty_data_context.get_batch("my_datasource/f1")
    expectations_config, validation_results = BasicDatasetProfiler.profile(batch)

    assert expectations_config["data_asset_name"] == "my_datasource/default/f1"
    assert expectations_config["expectation_suite_name"] == "default"

    # Profiler provenance metadata must be present on the suite.
    assert "BasicDatasetProfiler" in expectations_config["meta"]
    profiler_meta = expectations_config["meta"]["BasicDatasetProfiler"]
    assert set(profiler_meta.keys()) == {"created_by", "created_at", "batch_kwargs"}

    # Each generated expectation carries profiler metadata with a confidence entry.
    for expectation in expectations_config["expectations"]:
        assert "BasicDatasetProfiler" in expectation["meta"]
        assert "confidence" in expectation["meta"]["BasicDatasetProfiler"]

    validation_meta = validation_results["meta"]
    assert validation_meta["data_asset_name"] == "my_datasource/default/f1"
    assert set(validation_meta.keys()) == {
        "great_expectations.__version__",
        "data_asset_name",
        "expectation_suite_name",
        "run_id",
        "batch_kwargs",
    }
def test_BasicDatasetProfiler_partially_null_column(dataset):
    """
    Unit test to check the expectations that BasicDatasetProfiler creates for a
    partially null column.

    The test is executed against all the backends (Pandas, Spark, etc.), because it
    uses the fixture.

    "nulls" is the partially null column in the fixture dataset
    """
    expectations_config, evr_config = BasicDatasetProfiler.profile(dataset)

    # Expectations the profiler generated for the partially-null column.
    column_expectations = {
        expectation["expectation_type"]
        for expectation in expectations_config["expectations"]
        if expectation["kwargs"].get("column") == "nulls"
    }
    # Set literal instead of set([...]) wrapper (flake8-comprehensions C405/C403).
    assert column_expectations == {
        "expect_column_to_exist",
        "expect_column_values_to_be_in_type_list",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_set",
        "expect_column_values_to_be_unique",
    }
def test_BasicDatasetProfiler(mock_emit):
    """Smoke-test BasicDatasetProfiler on a tiny dataset and check suite metadata."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]},)

    # No expectations exist before profiling.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    # Profiling populates the dataset's suite.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        > 0
    )

    assert "BasicDatasetProfiler" in expectations_config.meta
    profiler_meta = expectations_config.meta["BasicDatasetProfiler"]
    assert set(profiler_meta.keys()) == {"created_by", "created_at", "batch_kwargs"}

    assert "notes" in expectations_config.meta
    notes = expectations_config.meta["notes"]
    assert set(notes.keys()) == {"format", "content"}
    assert "To add additional notes" in notes["content"][0]

    generated_types = set()
    for expectation in expectations_config.expectations:
        generated_types.add(expectation.expectation_type)
        assert "BasicDatasetProfiler" in expectation.meta
        assert "confidence" in expectation.meta["BasicDatasetProfiler"]

    assert {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_set",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    }.issubset(generated_types)

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for usage_statistics
    # is ExpectationSuite's add_expectation(). It will not send a usage_stats event when called from a Profiler.
    # this number can change in the future our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
def test_BasicDatasetProfiler():
    """Smoke-test BasicDatasetProfiler on a tiny named dataset and check suite metadata."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]}, data_asset_name="toy_dataset")

    # No expectations exist before profiling.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True)["expectations"])
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    # Profiling populates the dataset's suite.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True)["expectations"])
        > 0
    )

    assert expectations_config["data_asset_name"] == "toy_dataset"

    assert "BasicDatasetProfiler" in expectations_config["meta"]
    profiler_meta = expectations_config["meta"]["BasicDatasetProfiler"]
    assert set(profiler_meta.keys()) == {"created_by", "created_at"}

    assert "notes" in expectations_config["meta"]
    notes = expectations_config["meta"]["notes"]
    assert set(notes.keys()) == {"format", "content"}
    assert "To add additional notes" in notes["content"][0]

    generated_types = set()
    for expectation in expectations_config["expectations"]:
        generated_types.add(expectation["expectation_type"])
        assert "BasicDatasetProfiler" in expectation["meta"]
        assert "confidence" in expectation["meta"]["BasicDatasetProfiler"]

    assert {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_set",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    }.issubset(generated_types)
def test_BasicDatasetProfiler():
    """Smoke-test BasicDatasetProfiler and verify the suite's metadata and contents."""
    toy_dataset = PandasDataset({"x": [1, 2, 3]},)

    # No expectations exist before profiling.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        == 0
    )

    expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset)

    # Profiling populates the dataset's suite.
    assert (
        len(toy_dataset.get_expectation_suite(suppress_warnings=True).expectations)
        > 0
    )

    assert "BasicDatasetProfiler" in expectations_config.meta
    profiler_meta = expectations_config.meta["BasicDatasetProfiler"]
    assert set(profiler_meta.keys()) == {"created_by", "created_at", "batch_kwargs"}

    assert "notes" in expectations_config.meta
    notes = expectations_config.meta["notes"]
    assert set(notes.keys()) == {"format", "content"}
    assert "To add additional notes" in notes["content"][0]

    generated_types = set()
    for expectation in expectations_config.expectations:
        generated_types.add(expectation.expectation_type)
        assert "BasicDatasetProfiler" in expectation.meta
        assert "confidence" in expectation.meta["BasicDatasetProfiler"]

    assert {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_be_in_set",
        "expect_column_unique_value_count_to_be_between",
        "expect_column_proportion_of_unique_values_to_be_between",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    }.issubset(generated_types)