def _add_checkpoint(
    context: BaseDataContext,
    backend_api: str,
    datasource_name: str,
    data_connector_name: str,
    checkpoint_name: str,
    suite_and_asset_names=[],
) -> SimpleCheckpoint:
    if backend_api == "V3":
        validations = [
            {
                "expectation_suite_name": suite_and_asset_name,
                "batch_request": {
                    "datasource_name": datasource_name,
                    "data_connector_name": data_connector_name,
                    "data_asset_name": suite_and_asset_name,
                    "batch_spec_passthrough": {"create_temp_table": False},
                },
            }
            for suite_and_asset_name in suite_and_asset_names
        ]
        return context.add_checkpoint(
            name=checkpoint_name,
            class_name="SimpleCheckpoint",
            validations=validations,
            run_name_template="my_run_name",
        )
    elif backend_api == "V2":
        batches = [
            {
                "expectation_suite_names": [suite_and_asset_name],
                "batch_kwargs": {
                    "datasource": datasource_name,
                    "data_asset_name": suite_and_asset_name,
                    "table": suite_and_asset_name,
                    "batch_spec_passthrough": {"create_temp_table": False},
                },
            }
            for suite_and_asset_name in suite_and_asset_names
        ]
        return context.add_checkpoint(
            name=checkpoint_name,
            class_name="LegacyCheckpoint",
            batches=batches,
        )
    else:
        raise ValueError(f"Unsupported backend_api {backend_api}")
Beispiel #2
0
                "reader_options": {
                    "header": True,
                },
            },
        },
        "expectation_suite_name": expectation_suite_name,
    }],
}

# For the purposes of this script, the data_asset_name includes "sample"
# (re-points the single validation at the sampled taxi-data asset).
checkpoint_config["validations"][0]["batch_request"][
    "data_asset_name"] = "yellow_tripdata_sample"

# Round-trip the dict config through YAML to sanity-check it before use.
my_checkpoint = context.test_yaml_config(yaml.dump(checkpoint_config))

# Register the checkpoint with the data context.
context.add_checkpoint(**checkpoint_config)

# Execute the checkpoint; presumably my_checkpoint_name matches the "name"
# key of checkpoint_config — both are defined earlier in the (not shown)
# part of this script. TODO confirm.
checkpoint_result = context.run_checkpoint(
    checkpoint_name=my_checkpoint_name, )
# CODE ^^^^^ ^^^^^

# NOTE: The following code is only for testing and can be ignored by users.
# ASSERTIONS vvvvv vvvvv
assert checkpoint_result.checkpoint_config["name"] == my_checkpoint_name
# The sample run is expected to fail overall validation...
assert not checkpoint_result.success
first_validation_result_identifier = (
    checkpoint_result.list_validation_result_identifiers()[0])
first_run_result = checkpoint_result.run_results[
    first_validation_result_identifier]
# ...while exactly one expectation succeeds.
assert (first_run_result["validation_result"]["statistics"]
        ["successful_expectations"] == 1)
Beispiel #3
0
class GreatExpectationsHelpers(object):
    """
    Great Expectations helper class.

    Provides basic utilities to:
        1) Create a GE workflow specific to a manifest according to
           validation rules
        2) Parse the results dict to generate appropriate errors
    """

    def __init__(self,
        sg,
        unimplemented_expectations,
        manifest,
        manifestPath
        ):
        """
            Purpose:
                Instantiate a great expectations helpers object
            Args:
                sg:
                    schemaGenerator object
                unimplemented_expectations:
                    pattern of validation rules that currently do not have
                    expectations developed (matched against each rule with
                    re.match in build_expectation_suite)
                manifest:
                    manifest (DataFrame) being validated
                manifestPath:
                    path to the manifest file being validated
        """
        self.unimplemented_expectations = unimplemented_expectations
        self.sg = sg
        self.manifest = manifest
        self.manifestPath = manifestPath

    def build_context(self):
        """
            Purpose:
                Create a dataContext and datasource and add to object
            Returns:
                saves dataContext and datasource to self
        """
        self.context = ge.get_context()

        # datasource configuration: pandas execution engine plus a runtime
        # data connector, so the in-memory manifest can be validated directly
        datasource_config = {
            "name": "example_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                },
            },
        }

        # data context configuration; stores/docs are persisted under
        # ./great_expectations in the current working directory
        data_context_config = DataContextConfig(
            datasources={
                "pandas": DatasourceConfig(
                    class_name="Datasource",
                    execution_engine={
                        "class_name": "PandasExecutionEngine"
                    },
                    data_connectors={
                        "default_runtime_data_connector_name": {
                            "class_name": "RuntimeDataConnector",
                            "batch_identifiers": ["default_identifier_name"],
                        }
                    },
                )
            },
            store_backend_defaults=FilesystemStoreBackendDefaults(
                root_directory=os.path.join(os.getcwd(), 'great_expectations')
            ),
        )

        # build context and add data source
        self.context = BaseDataContext(project_config=data_context_config)
        self.context.add_datasource(**datasource_config)

    def build_expectation_suite(self):
        """
            Purpose:
                Construct an expectation suite to validate columns with
                rules that have expectations, and add the suite to the object
            Returns:
                saves expectation suite and identifier to self
        """
        # map each implemented validation rule to its expectation type
        validation_expectation = {
            "int": "expect_column_values_to_be_in_type_list",
            "float": "expect_column_values_to_be_in_type_list",
            "str": "expect_column_values_to_be_of_type",
            "num": "expect_column_values_to_be_in_type_list",
            "recommended": "expect_column_values_to_not_match_regex_list",
            "protectAges": "expect_column_values_to_be_between",
            "unique": "expect_column_values_to_be_unique",
            "inRange": "expect_column_values_to_be_between",

            # To be implemented rules with possible expectations
            # "list": "expect_column_values_to_not_match_regex_list",
            # "regex": "expect_column_values_to_match_regex",
            # "url": "expect_column_values_to_be_valid_urls",
            # "matchAtLeastOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
            # "matchExactlyOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
        }

        # create blank expectation suite
        expectation_suite_name = "Manifest_test_suite"
        self.suite = self.context.create_expectation_suite(
            expectation_suite_name=expectation_suite_name,
            overwrite_existing=True,
        )

        # Strip leading/trailing whitespace from string cells once, up front.
        # BUG FIX: the original called applymap inside the per-column loop and
        # discarded its return value, so the manifest was never stripped.
        self.manifest = self.manifest.applymap(
            lambda x: x.strip() if isinstance(x, str) else x
        )

        # build expectation configurations for each expectation
        for col in self.manifest.columns:
            # NOTE: args/meta are reset per column and accumulate across that
            # column's rules (behavior preserved from the original)
            args = {}
            meta = {}

            validation_rules = self.sg.get_node_validation_rules(col)

            # skip attributes that have no rules associated with them
            if not validation_rules:
                continue

            # iterate through all validation rules for an attribute
            for rule in validation_rules:

                # skip rules that do not yet have an implemented expectation
                if re.match(self.unimplemented_expectations, rule):
                    continue

                args["column"] = col
                args["result_format"] = "COMPLETE"

                # Validate num: int or float typed values
                if rule == 'num':
                    args["mostly"] = 1.0
                    args["type_list"] = ['int', 'int64', 'float', 'float64']
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect column values to be of int or float type. **Markdown** `Supported`"
                        },
                        "validation_rule": rule
                    }

                # Validate float
                elif rule == 'float':
                    args["mostly"] = 1.0
                    args["type_list"] = ['float', 'float64']
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect column values to be of float type. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                # Validate int
                elif rule == 'int':
                    args["mostly"] = 1.0
                    args["type_list"] = ['int', 'int64']
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect column values to be of int type. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                # Validate string
                elif rule == 'str':
                    args["mostly"] = 1.0
                    args["type_"] = 'str'
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect column values to be of string type. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                # Recommended: flag (nearly) empty columns via a regex that
                # matches the empty string; mostly≈0 so any match fails
                elif rule.startswith("recommended"):
                    args["mostly"] = 0.0000000001
                    args["regex_list"] = ['^$']
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect column to not be empty. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                # Protect ages: values outside the censoring limits fail
                elif rule.startswith("protectAges"):
                    # Function to convert to different age limit formats
                    min_age, max_age = self.get_age_limits()

                    args["mostly"] = 1.0
                    args["min_value"] = min_age
                    args["max_value"] = max_age
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect ages to be between 18 years (6,570 days) and 90 years (32,850 days) of age. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                elif rule.startswith("unique"):
                    args["mostly"] = 1.0
                    meta = {
                        "notes": {
                            "format": "markdown",
                            "content": "Expect column values to be Unique. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                # In range: bounds are parsed from the rule string itself,
                # e.g. "inRange 0 100"
                elif rule.startswith("inRange"):
                    args["mostly"] = 1.0
                    args["min_value"] = float(rule.split(" ")[1])
                    args["max_value"] = float(rule.split(" ")[2])
                    meta = {
                        "notes": {
                            "format": "markdown",
                            # BUG FIX: note previously said "Unique",
                            # copy-pasted from the unique branch above
                            "content": "Expect column values to be within the specified range. **Markdown** `Supported`",
                        },
                        "validation_rule": rule
                    }

                # add expectation for attribute to suite
                self.add_expectation(
                    rule=rule,
                    args=args,
                    meta=meta,
                    validation_expectation=validation_expectation,
                )

        self.context.save_expectation_suite(
            expectation_suite=self.suite,
            expectation_suite_name=expectation_suite_name,
        )

        suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name
        )
        self.context.build_data_docs(resource_identifiers=[suite_identifier])
        # Webpage DataDocs can be opened here:
        # self.context.open_data_docs(resource_identifier=suite_identifier)

    def add_expectation(
        self,
        rule: str,
        args: Dict,
        meta: Dict,
        validation_expectation: Dict,
        ):
        """
            Purpose:
                Add an individual expectation for a rule to the suite
            Input:
                rule:
                    validation rule
                args:
                    dict of arguments specifying expectation behavior
                meta:
                    dict of additional information for each expectation
                validation_expectation:
                    dictionary mapping rules to expectation types
            Returns:
                adds expectation to self.suite
        """
        # The base rule name (first token) selects the expectation type.
        # Assumes every rule reaching here is present in
        # validation_expectation (unimplemented rules are filtered upstream).
        expectation_configuration = ExpectationConfiguration(
            expectation_type=validation_expectation[rule.split(" ")[0]],
            # arguments and meta message
            kwargs={**args},
            meta={**meta},
        )
        # Add the Expectation to the suite
        self.suite.add_expectation(
            expectation_configuration=expectation_configuration
        )

    def build_checkpoint(self):
        """
            Purpose:
                Build checkpoint to validate manifest
            Returns:
                adds checkpoint to self.context
        """
        # create manifest checkpoint; names must match the datasource,
        # connector, and suite created in build_context /
        # build_expectation_suite
        checkpoint_name = "manifest_checkpoint"
        checkpoint_config = {
            "name": checkpoint_name,
            "config_version": 1,
            "class_name": "SimpleCheckpoint",
            "validations": [
                {
                    "batch_request": {
                        "datasource_name": "example_datasource",
                        "data_connector_name": "default_runtime_data_connector_name",
                        "data_asset_name": "Manifest",
                    },
                    "expectation_suite_name": "Manifest_test_suite",
                }
            ],
        }

        self.context.add_checkpoint(**checkpoint_config)

    def generate_errors(
        self,
        validation_results: Dict,
        validation_types: Dict,
        errors: List,
        warnings: List
        ):
        """
            Purpose:
                Parse results dictionary and generate errors for expectations
            Input:
                validation_results:
                    dictionary of results for each expectation
                validation_types:
                    dict of types of errors to generate for each validation rule
                errors:
                    list of errors (appended to in place)
                warnings:
                    list of warnings (appended to in place)
            Returns:
                errors:
                    list of errors
                warnings:
                    list of warnings
                self.manifest:
                    manifest, possibly updated (censored ages)
        """
        # map GE observed dtypes back to Python types for membership checks
        type_dict = {
            "float64": float,
            "int64": int,
            "str": str,
        }
        for result_dict in validation_results[0]['results']:

            indices = []
            values = []

            # if the expectation failed, get information to generate the
            # error message
            if not result_dict['success']:
                errColumn = result_dict['expectation_config']['kwargs']['column']
                rule = result_dict['expectation_config']['meta']['validation_rule']

                # only some expectations explicitly list unexpected values
                # and indices; reconstruct them from the manifest if absent
                if 'unexpected_index_list' in result_dict['result']:
                    indices = result_dict['result']['unexpected_index_list']
                    values = result_dict['result']['unexpected_list']
                else:
                    # Failsafe: type validation is a column-aggregate (not a
                    # column-map) expectation when columns are not of object
                    # type, so GE cannot return indices/values; collect every
                    # cell whose type matches the observed (offending) type.
                    observed_type = result_dict['result']['observed_value']
                    for i, item in enumerate(self.manifest[errColumn]):
                        if isinstance(item, type_dict[observed_type]):
                            indices.append(i)
                            values.append(item)

                # generate error messages and add to the error list;
                # row numbers are offset by +2 (header row + 1-indexing)
                base_rule = rule.split(" ")[0]
                if validation_types[base_rule] == 'type_validation':
                    for row, value in zip(indices, values):
                        errors.append(
                            GenerateError.generate_type_error(
                                val_rule=rule,
                                row_num=row + 2,
                                attribute_name=errColumn,
                                invalid_entry=value,
                            )
                        )
                elif validation_types[base_rule] == 'regex_validation':
                    # NOTE(review): expects a 'regex' kwarg, but the rules
                    # implemented above only set 'regex_list' — confirm which
                    # expectations route through this branch
                    expression = result_dict['expectation_config']['kwargs']['regex']

                    for row, value in zip(indices, values):
                        errors.append(
                            GenerateError.generate_regex_error(
                                val_rule=rule,
                                reg_expression=expression,
                                row_num=row + 2,
                                module_to_call='match',
                                attribute_name=errColumn,
                                invalid_entry=value,
                            )
                        )
                elif validation_types[base_rule] == 'content_validation':
                    content_errors, content_warnings = GenerateError.generate_content_error(
                        val_rule=rule,
                        attribute_name=errColumn,
                        row_num=list(np.array(indices) + 2),
                        error_val=values,
                        sg=self.sg,
                    )
                    if content_errors:
                        errors.append(content_errors)
                        # ages must be censored before the manifest is shared
                        if rule.startswith('protectAges'):
                            self.censor_ages(content_errors, errColumn)
                    elif content_warnings:
                        warnings.append(content_warnings)
                        if rule.startswith('protectAges'):
                            self.censor_ages(content_warnings, errColumn)

        return errors, warnings

    def get_age_limits(self):
        """
            Purpose:
                Get boundaries of ages that need to be censored
            Returns:
                min_age:
                    minimum age (in days) that will not be censored
                max_age:
                    maximum age (in days) that will not be censored
        """
        # NOTE(review): these values are slightly below exact 18y/90y in days
        # (18*365 = 6570, 90*365 = 32850), while the expectation message
        # elsewhere quotes 6,570/32,850 — confirm which limits are intended
        # before changing either side.
        min_age = 6550      # days
        max_age = 32849     # days

        return min_age, max_age

    def censor_ages(
        self,
        message: List,
        col: str,
        ):
        """
            Purpose:
                Censor ages in the manifest as appropriate
            Input:
                message:
                    error or warning message for an age validation rule;
                    message[0] is expected to hold the (1-indexed, header-
                    offset) row numbers of the offending ages
                col:
                    name of column containing ages
            Returns:
                updates self.manifest with censored ages and writes a
                "_censored" copy of the manifest CSV next to the original
        """
        # undo the +2 offset applied when the row numbers were generated
        censor_rows = list(np.array(message[0]) - 2)
        self.manifest.loc[censor_rows, (col)] = 'age censored'

        # update the manifest file, so that ages are censored
        self.manifest.to_csv(
            self.manifestPath.replace('.csv', '_censored.csv'), index=False
        )
        logging.info("Sensitive ages have been censored.")

        return
      data_connector_query:
        index: -1
      batch_spec_passthrough:
        reader_method: csv
        reader_options:
          header: True
    expectation_suite_name: {expectation_suite_name}
"""

# For the purposes of this script, the data_asset_name includes "sample"
# (the config here is a YAML string, so the asset name is swapped textually).
checkpoint_config = checkpoint_config.replace("yellow_tripdata",
                                              "yellow_tripdata_sample")

# Sanity-check the YAML checkpoint config before registering it.
my_checkpoint = context.test_yaml_config(checkpoint_config)

# NOTE(review): yaml.load without an explicit Loader is deprecated and unsafe
# on untrusted input if this is PyYAML — prefer yaml.safe_load; confirm which
# yaml object is imported at the top of the (not shown) script.
context.add_checkpoint(**yaml.load(checkpoint_config))

# Execute the checkpoint; presumably my_checkpoint_name matches the name in
# the YAML config — both defined earlier in the cut-off part. TODO confirm.
checkpoint_result = context.run_checkpoint(
    checkpoint_name=my_checkpoint_name, )
# CODE ^^^^^ ^^^^^

# NOTE: The following code is only for testing and can be ignored by users.
# ASSERTIONS vvvvv vvvvv
assert checkpoint_result.checkpoint_config["name"] == my_checkpoint_name
# The sample run is expected to fail overall validation...
assert not checkpoint_result.success
first_validation_result_identifier = (
    checkpoint_result.list_validation_result_identifiers()[0])
first_run_result = checkpoint_result.run_results[
    first_validation_result_identifier]
# ...while exactly one expectation succeeds.
assert (first_run_result["validation_result"]["statistics"]
        ["successful_expectations"] == 1)