def test_ge_pandas_sampling():
    """Sampling a PandasDataset must keep both its type and its expectations."""
    # Column name -> values, kept as ordered pairs so that the dataset and
    # the expectations below are driven from a single description.
    column_values = [
        ("A", [1, 2, 3, 4]),
        ("B", [5, 6, 7, 8]),
        ("C", ["a", "b", "c", "d"]),
        ("D", ["e", "f", "g", "h"]),
    ]
    df = ge.dataset.PandasDataset(
        {column: list(values) for column, values in column_values}
    )

    # Attach some simple expectations to the parent data frame: one
    # existence expectation per column (from the profiler) plus one
    # value-set expectation per column.
    df.profile(profiler=ColumnsExistProfiler)
    for column, values in column_values:
        df.expect_column_values_to_be_in_set(column, list(values))
    parent_expectations = df.find_expectations()

    # Whether sampled by count or by fraction (with replacement), the result
    # should:
    #
    # 1. Be a ge.dataset.PandasDataset
    # 2. Inherit ALL the expectations of the parent data frame
    for sample_kwargs in ({"n": 2}, {"frac": 0.25, "replace": True}):
        sample = df.sample(**sample_kwargs)
        assert isinstance(sample, ge.dataset.PandasDataset)
        assert sample.find_expectations() == parent_expectations

    # Replace the expectation on column "D" with one the data does not
    # satisfy ("h" is not in the new set), then sample again. The failing
    # expectation is NOT automatically dropped in the sample.
    df.expect_column_values_to_be_in_set("D", ["e", "f", "g", "x"])
    sample = df.sample(n=2)

    expected_value_sets = column_values[:3] + [("D", ["e", "f", "g", "x"])]
    existence_expectations = [
        {"expectation_type": "expect_column_to_exist", "kwargs": {"column": column}}
        for column, _ in expected_value_sets
    ]
    value_set_expectations = [
        {
            "expectation_type": "expect_column_values_to_be_in_set",
            "kwargs": {"column": column, "value_set": list(values)},
        }
        for column, values in expected_value_sets
    ]
    expected_suite = expectationSuiteSchema.load(
        {
            "expectation_suite_name": "test",
            "expectations": existence_expectations + value_set_expectations,
        }
    )
    assert sample.find_expectations() == expected_suite.expectations
def test_ge_pandas_sampling():
    """Verify that samples of a PandasDataset inherit the parent's expectations."""

    def column_exists(column):
        # Serialized form of an "expect_column_to_exist" expectation.
        return {
            'expectation_type': 'expect_column_to_exist',
            'kwargs': {'column': column},
        }

    def values_in_set(column, value_set):
        # Serialized form of an "expect_column_values_to_be_in_set" expectation.
        return {
            'expectation_type': 'expect_column_values_to_be_in_set',
            'kwargs': {'column': column, 'value_set': value_set},
        }

    def assert_sample_matches(sample, expected_expectations):
        # A sample must still be a ge.dataset.PandasDataset and must carry
        # exactly the expected expectations.
        assert isinstance(sample, ge.dataset.PandasDataset)
        assert sample.find_expectations() == expected_expectations

    df = ge.dataset.PandasDataset({
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': ['a', 'b', 'c', 'd'],
        'D': ['e', 'f', 'g', 'h'],
    })

    # Put some simple expectations on the data frame.
    df.profile(profiler=ColumnsExistProfiler)
    initial_value_sets = (
        ("A", [1, 2, 3, 4]),
        ("B", [5, 6, 7, 8]),
        ("C", ['a', 'b', 'c', 'd']),
        ("D", ['e', 'f', 'g', 'h']),
    )
    for column, value_set in initial_value_sets:
        df.expect_column_values_to_be_in_set(column, value_set)
    parent_expectations = df.find_expectations()

    # A sample taken by row count and a sample taken by fraction (with
    # replacement) must both inherit ALL the expectations of the parent.
    assert_sample_matches(df.sample(n=2), parent_expectations)
    assert_sample_matches(df.sample(frac=0.25, replace=True), parent_expectations)

    # Change the expectation on column "D", sample, and check expectations.
    # The now-failing expectation on "D" is NOT automatically dropped in the
    # sample.
    df.expect_column_values_to_be_in_set("D", ['e', 'f', 'g', 'x'])
    resampled = df.sample(n=2)

    serialized_expectations = [column_exists(column) for column in ('A', 'B', 'C', 'D')]
    serialized_expectations.extend([
        values_in_set('A', [1, 2, 3, 4]),
        values_in_set('B', [5, 6, 7, 8]),
        values_in_set('C', ['a', 'b', 'c', 'd']),
        values_in_set('D', ['e', 'f', 'g', 'x']),
    ])
    load_result = expectationSuiteSchema.load({
        'data_asset_name': 'test',
        'expectation_suite_name': 'test',
        "expectations": serialized_expectations,
    })
    expected_suite = load_result.data
    assert resampled.find_expectations() == expected_suite.expectations
def validate(
    data_asset,
    expectation_suite=None,
    data_asset_name=None,
    expectation_suite_name=None,
    data_context=None,
    data_asset_class_name=None,
    data_asset_module_name="great_expectations.dataset",
    data_asset_class=None,
    *args,
    **kwargs
):
    """Validate a data asset against an expectation suite.

    The suite is either passed in directly (as an object or as a dict that
    is loaded through ``expectationSuiteSchema``) or fetched by name from a
    DataContext. When neither ``data_asset_class_name`` nor
    ``data_asset_class`` is given, this is a thin convenience wrapper around
    ``data_asset.validate``. Otherwise the asset is first converted to the
    requested dataset class so that custom expectations are available.

    Args:
        data_asset: the asset to validate.
        expectation_suite: the suite to use (object or dict), or None to
            fetch one from ``data_context``.
        data_asset_name: name of the data asset; only used in the log
            message, and must be None when ``expectation_suite`` is given.
        expectation_suite_name: name of the suite to fetch from
            ``data_context``; must be None when ``expectation_suite`` is
            given.
        data_context: the DataContext used to fetch an expectation suite.
            When no suite is given, a string is treated as a path and a
            DataContext is built from it.
        data_asset_class_name: name of a class to load dynamically from
            ``data_asset_module_name``.
        data_asset_module_name: module from which ``data_asset_class_name``
            is loaded.
        data_asset_class: a class to convert the asset to; when provided,
            ``data_asset_class_name``/``data_asset_module_name`` are not
            used.
        *args: forwarded positionally to the asset's ``validate`` method.
        **kwargs: forwarded as keywords to the asset's ``validate`` method.

    Returns:
        Whatever the (possibly converted) asset's ``validate`` method
        returns.

    Raises:
        ValueError: if neither a suite nor a DataContext is given; if a
            suite is given together with ``data_asset_name`` or
            ``expectation_suite_name``; if the asset is not a Dataset or
            pandas DataFrame when a conversion is requested; or if the asset
            is not a subtype of the requested class (a plain DataFrame may
            still be coerced to a PandasDataset subclass).
    """
    suite_was_supplied = expectation_suite is not None

    if not suite_was_supplied and data_context is None:
        raise ValueError(
            "Either an expectation suite or a DataContext is required for validation.")

    if suite_was_supplied:
        # A plain dict is deserialized first; the name arguments are then
        # rejected because the supplied suite already determines them.
        if isinstance(expectation_suite, dict):
            expectation_suite = expectationSuiteSchema.load(expectation_suite)
        if data_asset_name is not None:
            raise ValueError("When providing an expectation suite, data_asset_name cannot also be provided.")
        if expectation_suite_name is not None:
            raise ValueError("When providing an expectation suite, expectation_suite_name cannot also be provided.")
    else:
        logger.info("Using expectation suite from DataContext.")
        # A string data_context is interpreted as a path to load from.
        if isinstance(data_context, str):
            from great_expectations.data_context import DataContext
            data_context = DataContext(data_context)
        expectation_suite = data_context.get_expectation_suite(
            expectation_suite_name=expectation_suite_name
        )

    log_message = "Validating data_asset_name %s with expectation_suite_name %s" % (
        data_asset_name,
        expectation_suite.expectation_suite_name,
    )
    logger.info(log_message)

    # No target class requested: the asset is expected to already provide
    # validate(), so no conversion is attempted.
    if data_asset_class_name is None and data_asset_class is None:
        return data_asset.validate(
            *args,
            expectation_suite=expectation_suite,
            data_context=data_context,
            **kwargs
        )

    # A conversion was requested; resolve the target class by name if the
    # class itself was not handed in.
    if data_asset_class is None:
        verify_dynamic_loading_support(module_name=data_asset_module_name)
        data_asset_class = load_class(data_asset_class_name, data_asset_module_name)

    import pandas as pd
    from great_expectations.dataset import Dataset, PandasDataset

    # Still nothing resolved: guess the GE class from the asset's own type.
    # Add other data_asset_type conditions here as needed.
    if data_asset_class is None and isinstance(data_asset, pd.DataFrame):
        data_asset_class = PandasDataset

    # Conversion to a subclass is only supported for datasets.
    if not isinstance(data_asset, (Dataset, pd.DataFrame)):
        raise ValueError(
            "The validate util method only supports dataset validations, including custom subclasses. For other data "
            "asset types, use the object's own validate method."
        )

    if not issubclass(type(data_asset), data_asset_class):
        # Special allowed coercion: plain DataFrame -> PandasDataset subclass.
        dataframe_coercion = (
            isinstance(data_asset, pd.DataFrame)
            and issubclass(data_asset_class, PandasDataset)
        )
        if not dataframe_coercion:
            raise ValueError(
                "The validate util method only supports validation for subtypes of the provided data_asset_type.")

    converted_asset = _convert_to_dataset_class(
        data_asset,
        dataset_class=data_asset_class,
        expectation_suite=expectation_suite,
    )
    return converted_asset.validate(*args, data_context=data_context, **kwargs)
def test_data_context_updates_expectation_suite_names(data_context):
    """Saving a suite through the data context must record the name it was saved under.

    Three save paths are exercised:
      1. saving under a name equal to the suite's own (new) name,
      2. saving under an explicit name that differs from the suite's own
         name, in which case the explicit name is the one that is stored,
      3. saving without an explicit name, so the context takes the name
         from the suite itself.
    """
    # The fixture is expected to define exactly one expectation suite.
    available_suites = data_context.list_expectation_suites()
    assert len(available_suites) == 1
    original_name = available_suites[0].expectation_suite_name

    # Fetch that suite; it should report the name it was listed under.
    suite = data_context.get_expectation_suite(original_name)
    assert suite.expectation_suite_name == original_name

    # 1. Rename the suite and save it directly under the new name.
    first_name = 'a_new_suite_name'
    suite.expectation_suite_name = first_name
    data_context.save_expectation_suite(
        expectation_suite=suite,
        expectation_suite_name=first_name,
    )
    refetched = data_context.get_expectation_suite(first_name)
    assert refetched.expectation_suite_name == first_name

    # 2. Save under a different explicit name; the explicit name should
    #    overwrite the name carried by the suite object.
    second_name = 'a_new_new_suite_name'
    data_context.save_expectation_suite(
        expectation_suite=suite,
        expectation_suite_name=second_name,
    )
    refetched = data_context.get_expectation_suite(second_name)
    assert refetched.expectation_suite_name == second_name

    # The overwritten name must also be what is persisted on disk.
    suite_path = os.path.join(
        data_context.root_directory,
        "expectations",
        "a_new_new_suite_name.json",
    )
    with open(suite_path, 'r') as suite_file:
        raw_suite = json.load(suite_file)
    persisted_suite = expectationSuiteSchema.load(raw_suite).data
    assert persisted_suite.expectation_suite_name == second_name

    # 3. Give the suite a third name and save without passing a name, so the
    #    context has to draw it from the suite.
    third_name = "a_third_suite_name"
    suite.expectation_suite_name = third_name
    data_context.save_expectation_suite(expectation_suite=suite)
    refetched = data_context.get_expectation_suite(third_name)
    assert refetched.expectation_suite_name == third_name