def test__find_next_string_column(non_numeric_high_card_dataset, non_numeric_low_card_dataset):
    """_find_next_string_column should return each unprofiled string candidate
    in turn on the high-cardinality dataset, and None when the dataset has no
    string candidates left (low-cardinality dataset)."""
    table_columns = non_numeric_high_card_dataset.get_table_columns()
    cache = {}
    already_profiled = {"numeric": [], "low_card": [], "string": [], "datetime": []}

    first_pick = SampleExpectationsDatasetProfiler._find_next_string_column(
        non_numeric_high_card_dataset, table_columns, already_profiled, cache
    )
    remaining_candidates = ["highcardnonnum", "medcardnonnum"]
    assert first_pick in remaining_candidates

    # Mark the first pick as profiled; the next call must yield the other one.
    already_profiled["string"].append(first_pick)
    remaining_candidates.remove(first_pick)
    second_pick = SampleExpectationsDatasetProfiler._find_next_string_column(
        non_numeric_high_card_dataset, table_columns, already_profiled, cache
    )
    assert second_pick in remaining_candidates

    # The low-cardinality dataset offers no string column at all.
    table_columns = non_numeric_low_card_dataset.get_table_columns()
    cache = {}
    already_profiled = {"numeric": [], "low_card": [], "string": [], "datetime": []}
    assert (
        SampleExpectationsDatasetProfiler._find_next_string_column(
            non_numeric_low_card_dataset, table_columns, already_profiled, cache
        )
        is None
    )
def test__find_next_numeric_column(numeric_high_card_dataset, non_numeric_low_card_dataset):
    """_find_next_numeric_column should return "norm_0_1" once on the numeric
    dataset, then None after it is profiled, and None straight away on a
    dataset with no numeric candidates."""
    table_columns = numeric_high_card_dataset.get_table_columns()
    cache = {}
    already_profiled = {"numeric": [], "low_card": [], "string": [], "datetime": []}

    picked = SampleExpectationsDatasetProfiler._find_next_numeric_column(
        numeric_high_card_dataset, table_columns, already_profiled, cache
    )
    assert picked == "norm_0_1"

    # Once the only numeric column is profiled, nothing is left to pick.
    already_profiled["numeric"].append(picked)
    assert (
        SampleExpectationsDatasetProfiler._find_next_numeric_column(
            numeric_high_card_dataset, table_columns, already_profiled, cache
        )
        is None
    )

    # A dataset without numeric columns yields None immediately.
    table_columns = non_numeric_low_card_dataset.get_table_columns()
    cache = {}
    already_profiled = {"numeric": [], "low_card": [], "string": [], "datetime": []}
    assert (
        SampleExpectationsDatasetProfiler._find_next_numeric_column(
            non_numeric_low_card_dataset, table_columns, already_profiled, cache
        )
        is None
    )
def test__create_expectations_for_datetime_column(datetime_dataset):
    """_create_expectations_for_datetime_column should add exactly the
    existence, between, and not-null expectations for the datetime column."""
    column = "datetime"
    expectation_suite = datetime_dataset.get_expectation_suite(suppress_warnings=True)
    # The fixture starts with a single pre-existing expectation.
    assert len(expectation_suite.expectations) == 1

    SampleExpectationsDatasetProfiler._create_expectations_for_datetime_column(
        datetime_dataset, column
    )

    expectation_suite = datetime_dataset.get_expectation_suite(suppress_warnings=True)
    # Set comprehension (instead of set([...])) per the C403 idiom.
    assert {
        expectation.expectation_type
        for expectation in expectation_suite.expectations
        if expectation.kwargs.get("column") == column
    } == {
        "expect_column_to_exist",
        "expect_column_values_to_be_between",
        "expect_column_values_to_not_be_null",
    }
def test__create_expectations_for_string_column(non_numeric_high_card_dataset):
    """_create_expectations_for_string_column should add exactly the
    existence, not-null, and value-length expectations for the column."""
    column = "highcardnonnum"
    expectation_suite = non_numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True
    )
    # The fixture starts with two pre-existing expectations.
    assert len(expectation_suite.expectations) == 2

    SampleExpectationsDatasetProfiler._create_expectations_for_string_column(
        non_numeric_high_card_dataset, column
    )

    expectation_suite = non_numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True
    )
    # Set comprehension (instead of set([...])) per the C403 idiom.
    assert {
        expectation.expectation_type
        for expectation in expectation_suite.expectations
        if expectation.kwargs.get("column") == column
    } == {
        "expect_column_to_exist",
        "expect_column_values_to_not_be_null",
        "expect_column_value_lengths_to_be_between",
    }
def test__create_expectations_for_low_card_column(non_numeric_low_card_dataset):
    """_create_expectations_for_low_card_column should add exactly the
    existence, distinct-set, KL-divergence, and not-null expectations."""
    column = "lowcardnonnum"
    column_cache = {}
    expectation_suite = non_numeric_low_card_dataset.get_expectation_suite(
        suppress_warnings=True
    )
    # The fixture starts with a single pre-existing expectation.
    assert len(expectation_suite.expectations) == 1

    SampleExpectationsDatasetProfiler._create_expectations_for_low_card_column(
        non_numeric_low_card_dataset, column, column_cache
    )

    expectation_suite = non_numeric_low_card_dataset.get_expectation_suite(
        suppress_warnings=True
    )
    # Set comprehension (instead of set([...])) per the C403 idiom;
    # quote style made consistent with the rest of the file.
    assert {
        expectation.expectation_type
        for expectation in expectation_suite.expectations
        if expectation.kwargs.get("column") == column
    } == {
        "expect_column_to_exist",
        "expect_column_distinct_values_to_be_in_set",
        "expect_column_kl_divergence_to_be_less_than",
        "expect_column_values_to_not_be_null",
    }
def test__create_expectations_for_numeric_column(numeric_high_card_dataset, test_backend):
    """_create_expectations_for_numeric_column should add the numeric summary
    expectations; the quantile expectation is only produced on backends that
    support it (Pandas, Spark, PostgreSQL)."""
    column = "norm_0_1"
    expectation_suite = numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True
    )
    # The fixture starts with a single pre-existing expectation.
    assert len(expectation_suite.expectations) == 1

    SampleExpectationsDatasetProfiler._create_expectations_for_numeric_column(
        numeric_high_card_dataset, column
    )

    expectation_suite = numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True
    )
    # Build the expected set once instead of duplicating it per branch.
    expected_types = {
        "expect_column_to_exist",
        "expect_column_min_to_be_between",
        "expect_column_max_to_be_between",
        "expect_column_mean_to_be_between",
        "expect_column_median_to_be_between",
        "expect_column_values_to_not_be_null",
    }
    if test_backend in ["PandasDataset", "SparkDFDataset", "postgresql"]:
        # Only these backends generate the quantile expectation.
        expected_types.add("expect_column_quantile_values_to_be_between")

    # Set comprehension (instead of set([...])) per the C403 idiom.
    assert {
        expectation.expectation_type
        for expectation in expectation_suite.expectations
        if expectation.kwargs.get("column") == column
    } == expected_types
def test_SampleExpectationsDatasetProfiler_with_context(not_empty_datacontext):
    """End-to-end profile of a CSV batch through a DataContext.

    Checks, in order: the suite name, the profiler metadata on the suite,
    per-expectation profiler metadata, the validation-result metadata keys,
    the auto-generated markdown notes, and the full set of expectation types
    the profiler produced for this fixture data.
    """
    context = not_empty_datacontext
    context.create_expectation_suite("default")
    # Resolve the base directory of the subdir_reader generator so we can
    # build batch_kwargs pointing at the f1.csv fixture file.
    datasource = context.datasources["rad_datasource"]
    base_dir = datasource.config["generators"]["subdir_reader"][
        "base_directory"]
    batch_kwargs = {
        "datasource": "rad_datasource",
        "path": os.path.join(base_dir, "f1.csv"),
    }
    batch = context.get_batch(batch_kwargs, "default")
    # profile() returns both the generated suite and the validation results.
    expectation_suite, validation_results = SampleExpectationsDatasetProfiler.profile(
        batch)
    assert expectation_suite.expectation_suite_name == "default"
    # The profiler must stamp its own metadata block onto the suite...
    assert "SampleExpectationsDatasetProfiler" in expectation_suite.meta
    assert set(expectation_suite.meta["SampleExpectationsDatasetProfiler"].
               keys()) == {
                   "created_by",
                   "created_at",
                   "batch_kwargs",
               }
    # ...and record exactly the batch_kwargs it was profiled with.
    assert (expectation_suite.meta["SampleExpectationsDatasetProfiler"]
            ["batch_kwargs"] == batch_kwargs)
    # Every generated expectation carries profiler metadata with a confidence.
    for exp in expectation_suite.expectations:
        assert "SampleExpectationsDatasetProfiler" in exp.meta
        assert "confidence" in exp.meta["SampleExpectationsDatasetProfiler"]
    assert set(validation_results.meta.keys()) == {
        "batch_kwargs",
        "batch_markers",
        "batch_parameters",
        "expectation_suite_name",
        "great_expectations.__version__",
        "run_id",
    }
    # NOTE: the notes content is compared verbatim — the literal below must
    # match the profiler's output byte-for-byte, including the trailing
    # whitespace and final newline.
    assert expectation_suite.meta["notes"] == {
        "format": "markdown",
        "content": [
            """#### This is an _example_ suite - This suite was made by quickly glancing at 1000 rows of your data. - This is **not a production suite**. It is meant to show examples of expectations. - Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful. 
"""
        ]
    }
    expectation_types = [
        expectation["expectation_type"]
        for expectation in expectation_suite.expectations
    ]
    # The profiler should produce exactly these expectation types for f1.csv.
    expected_expectation_types = {
        'expect_table_row_count_to_be_between',
        'expect_table_column_count_to_equal',
        'expect_table_columns_to_match_ordered_list',
        'expect_column_values_to_not_be_null',
        'expect_column_min_to_be_between',
        'expect_column_max_to_be_between',
        'expect_column_mean_to_be_between',
        'expect_column_median_to_be_between',
        'expect_column_quantile_values_to_be_between'
    }
    assert set(expectation_types) == expected_expectation_types