def _split_multi_column_metric_domain_kwargs(
    domain_kwargs: Dict,
    domain_type: MetricDomainTypes,
) -> SplitDomainKwargs:
    """Split domain_kwargs for multicolumn domain types into compute and accessor domain kwargs.

    Args:
        domain_kwargs: A dictionary consisting of the domain kwargs specifying which data to obtain
        domain_type: an Enum value indicating which metric domain the user would like to be using.

    Returns:
        compute_domain_kwargs, accessor_domain_kwargs from domain_kwargs
        The union of compute_domain_kwargs, accessor_domain_kwargs is the input domain_kwargs
    """
    assert (
        domain_type == MetricDomainTypes.MULTICOLUMN
    ), "This method only supports MetricDomainTypes.MULTICOLUMN"

    compute_domain_kwargs: Dict = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs: Dict = {}

    if "column_list" not in domain_kwargs:
        raise ge_exceptions.GreatExpectationsError(
            "column_list not found within domain_kwargs"
        )

    column_list = compute_domain_kwargs.pop("column_list")

    if len(column_list) < 2:
        raise ge_exceptions.GreatExpectationsError(
            "column_list must contain at least 2 columns"
        )

    accessor_domain_kwargs["column_list"] = column_list

    return SplitDomainKwargs(compute_domain_kwargs, accessor_domain_kwargs)
def _split_column_pair_metric_domain_kwargs(
    domain_kwargs: Dict,
    domain_type: MetricDomainTypes,
) -> SplitDomainKwargs:
    """Split domain_kwargs for column pair domain types into compute and accessor domain kwargs.

    Args:
        domain_kwargs: A dictionary consisting of the domain kwargs specifying which data to obtain
        domain_type: an Enum value indicating which metric domain the user would like to be using.

    Returns:
        compute_domain_kwargs, accessor_domain_kwargs from domain_kwargs
        The union of compute_domain_kwargs, accessor_domain_kwargs is the input domain_kwargs
    """
    assert (
        domain_type == MetricDomainTypes.COLUMN_PAIR
    ), "This method only supports MetricDomainTypes.COLUMN_PAIR"

    compute_domain_kwargs: Dict = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs: Dict = {}

    if not ("column_A" in domain_kwargs and "column_B" in domain_kwargs):
        raise ge_exceptions.GreatExpectationsError(
            "column_A or column_B not found within domain_kwargs"
        )

    accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop("column_A")
    accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop("column_B")

    return SplitDomainKwargs(compute_domain_kwargs, accessor_domain_kwargs)
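# Usage sketch (illustrative, not from the source): how the two splitters above
# partition domain kwargs. Assumes SplitDomainKwargs is a two-field NamedTuple
# holding the compute and accessor dicts (field names vary by release), so it
# can be unpacked positionally.
def _example_split_column_pair_kwargs() -> None:
    domain_kwargs = {"batch_id": "my_batch", "column_A": "price", "column_B": "cost"}
    split = _split_column_pair_metric_domain_kwargs(
        domain_kwargs=domain_kwargs,
        domain_type=MetricDomainTypes.COLUMN_PAIR,
    )
    compute_kwargs, accessor_kwargs = split  # positional unpacking of the NamedTuple
    # compute_kwargs describes the data to fetch ({"batch_id": "my_batch"});
    # accessor_kwargs addresses the domain within it ({"column_A": ..., "column_B": ...}).
    assert {**compute_kwargs, **accessor_kwargs} == domain_kwargs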
def _add_spark_datasource(
    datasource_name: str, dataset: AbstractDataSet, ge_context: DataContext
) -> str:
    from great_expectations.datasource import SparkDFDatasource

    path = str(dataset._filepath.parent)
    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": os.path.join("..", path),
            }
        }
    )
    configuration["class_name"] = "SparkDFDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # Note: errors is a dict, so it cannot be formatted with the "{:s}" spec.
        raise ge_exceptions.GreatExpectationsError(
            f"Invalid Datasource configuration: {errors}"
        )

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
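# Usage sketch (illustrative): wiring the Kedro-flavored helper above into a
# project. SparkDataSet, its import path (which moves between kedro releases),
# and the file path are assumptions; any file-based Kedro AbstractDataSet with
# a _filepath attribute would work the same way.
def _example_register_spark_datasource() -> str:
    from great_expectations.data_context import DataContext
    from kedro.extras.datasets.spark import SparkDataSet  # assumed import path

    ge_context = DataContext()  # loads the project's great_expectations.yml
    dataset = SparkDataSet(filepath="data/01_raw/reviews.parquet")
    return _add_spark_datasource("reviews_spark", dataset, ge_context)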
def load_batch_data(self, batch_id: str, batch_data: Any) -> None:
    if isinstance(batch_data, pd.DataFrame):
        batch_data = PandasBatchData(self, batch_data)
    elif isinstance(batch_data, PandasBatchData):
        pass
    else:
        raise ge_exceptions.GreatExpectationsError(
            "PandasExecutionEngine requires batch data that is either a DataFrame or a PandasBatchData object"
        )

    super().load_batch_data(batch_id=batch_id, batch_data=batch_data)
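# Usage sketch (illustrative): load_batch_data accepts a raw pandas DataFrame,
# wraps it in PandasBatchData, and hands it to the base class; any other payload
# is rejected. Batch id and column names are placeholders.
def _example_load_batch_data():
    import pandas as pd
    from great_expectations.execution_engine import PandasExecutionEngine

    engine = PandasExecutionEngine()
    df = pd.DataFrame({"price": [1.0, 2.0, 3.0], "cost": [0.5, None, 0.25]})
    engine.load_batch_data(batch_id="example_batch", batch_data=df)
    # engine.load_batch_data(batch_id="bad", batch_data=[1, 2, 3]) would raise
    # GreatExpectationsError per the isinstance checks above.
    return engine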
def add_column_row_condition(
    self, domain_kwargs, column_name=None, filter_null=True, filter_nan=False
):
    """EXPERIMENTAL

    Add a row condition for handling null filter.

    Args:
        domain_kwargs: the domain kwargs to use as the base and to which to add the condition
        column_name: if provided, use this name to add the condition; otherwise, will use "column"
            key from table_domain_kwargs
        filter_null: if true, add a filter for null values
        filter_nan: if true, add a filter for nan values
    """
    if filter_null is False and filter_nan is False:
        logger.warning(
            "add_column_row_condition called with no filter condition requested"
        )
        return domain_kwargs

    if filter_nan:
        raise ge_exceptions.GreatExpectationsError(
            "Base ExecutionEngine does not support adding nan condition filters"
        )

    if "row_condition" in domain_kwargs and domain_kwargs["row_condition"]:
        raise ge_exceptions.GreatExpectationsError(
            "ExecutionEngine does not support updating existing row_conditions."
        )

    new_domain_kwargs = copy.deepcopy(domain_kwargs)
    assert "column" in domain_kwargs or column_name is not None
    if column_name is not None:
        column = column_name
    else:
        column = domain_kwargs["column"]
    new_domain_kwargs["condition_parser"] = "great_expectations__experimental__"
    new_domain_kwargs["row_condition"] = f'col("{column}").notnull()'
    return new_domain_kwargs
def add_column_row_condition(
    self, domain_kwargs, column_name=None, filter_null=True, filter_nan=False
):
    """EXPERIMENTAL

    Add a row condition for handling null filter.

    Args:
        domain_kwargs: the domain kwargs to use as the base and to which to add the condition
        column_name: if provided, use this name to add the condition; otherwise, will use "column"
            key from table_domain_kwargs
        filter_null: if true, add a filter for null values
        filter_nan: if true, add a filter for nan values
    """
    if filter_null is False and filter_nan is False:
        logger.warning(
            "add_column_row_condition called with no filter condition requested"
        )
        return domain_kwargs

    if filter_nan:
        raise ge_exceptions.GreatExpectationsError(
            "Base ExecutionEngine does not support adding nan condition filters"
        )

    new_domain_kwargs = copy.deepcopy(domain_kwargs)
    assert (
        "column" in domain_kwargs or column_name is not None
    ), "No column provided: A column must be provided in domain_kwargs or in the column_name parameter"
    if column_name is not None:
        column = column_name
    else:
        column = domain_kwargs["column"]

    row_condition: RowCondition = RowCondition(
        condition=f'col("{column}").notnull()',
        condition_type=RowConditionParserType.GE,
    )
    new_domain_kwargs.setdefault("filter_conditions", []).append(row_condition)
    return new_domain_kwargs
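# Usage sketch (illustrative) contrasting the two versions above: the older one
# writes "row_condition"/"condition_parser" directly (and refuses kwargs that
# already carry a row_condition), while the newer one appends a RowCondition to
# a "filter_conditions" list so a user-supplied row_condition survives. Which
# behavior you get depends on the installed great_expectations release.
def _example_null_filter_kwargs(engine):
    new_kwargs = engine.add_column_row_condition({"column": "price"})
    # Newer releases: new_kwargs["filter_conditions"] holds a RowCondition with
    # condition 'col("price").notnull()' and condition_type RowConditionParserType.GE.
    # Older releases: new_kwargs["row_condition"] == 'col("price").notnull()'.
    return new_kwargs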
def get_compute_domain(
    self,
    domain_kwargs: dict,
    domain_type: Union[str, "MetricDomainTypes"],
    accessor_keys: Optional[Iterable[str]] = [],
) -> Tuple[pd.DataFrame, dict, dict]:
    """Uses a given batch dictionary and domain kwargs (which include a row condition and a condition parser)
    to obtain and/or query a batch. Returns in the format of a Pandas DataFrame. If the domain is a single column,
    this is added to 'accessor domain kwargs' and used for later access

    Args:
        domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
        domain_type (str or "MetricDomainTypes") - an Enum value indicating which metric domain the user would
        like to be using, or a corresponding string value representing it. String types include "identity",
        "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from the
        class MetricDomainTypes.
        accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when
        describing the domain and simply transferred with their associated values into accessor_domain_kwargs.

    Returns:
        A tuple including:
          - a DataFrame (the data on which to compute)
          - a dictionary of compute_domain_kwargs, describing the DataFrame
          - a dictionary of accessor_domain_kwargs, describing any accessors needed to
            identify the domain within the compute domain
    """
    # Extracting value from enum if it is given for future computation
    domain_type = MetricDomainTypes(domain_type)

    batch_id = domain_kwargs.get("batch_id")
    if batch_id is None:
        # We allow no batch id specified if there is only one batch
        if self.active_batch_data_id is not None:
            data = self.active_batch_data.dataframe
        else:
            raise ge_exceptions.ValidationError(
                "No batch is specified, but could not identify a loaded batch."
            )
    else:
        if batch_id in self.loaded_batch_data_dict:
            data = self.loaded_batch_data_dict[batch_id].dataframe
        else:
            raise ge_exceptions.ValidationError(
                f"Unable to find batch with batch_id {batch_id}"
            )

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = dict()
    table = domain_kwargs.get("table", None)
    if table:
        raise ValueError(
            "PandasExecutionEngine does not currently support multiple named tables."
        )

    # Filtering by row condition
    row_condition = domain_kwargs.get("row_condition", None)
    if row_condition:
        condition_parser = domain_kwargs.get("condition_parser", None)

        # Ensuring proper condition parser has been provided
        if condition_parser not in ["python", "pandas"]:
            raise ValueError(
                "condition_parser is required when setting a row_condition,"
                " and must be 'python' or 'pandas'"
            )
        else:
            # Querying row condition
            data = data.query(row_condition, parser=condition_parser).reset_index(
                drop=True
            )

    # Warning user if accessor keys are in any domain that is not of type table; they will be ignored
    if (
        domain_type != MetricDomainTypes.TABLE
        and accessor_keys is not None
        and len(accessor_keys) > 0
    ):
        logger.warning(
            "Accessor keys ignored since Metric Domain Type is not 'table'"
        )

    # If given table (this is default), get all unexpected accessor_keys (an optional parameter allowing us to
    # modify domain access)
    if domain_type == MetricDomainTypes.TABLE:
        if accessor_keys is not None and len(accessor_keys) > 0:
            for key in accessor_keys:
                accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        if len(compute_domain_kwargs.keys()) > 0:
            for key in compute_domain_kwargs.keys():
                # Warning user if kwarg not "normal"
                if key not in [
                    "batch_id",
                    "table",
                    "row_condition",
                    "condition_parser",
                ]:
                    logger.warning(
                        f"Unexpected key {key} found in domain_kwargs for domain type {domain_type.value}"
                    )
        return data, compute_domain_kwargs, accessor_domain_kwargs

    # If user has stated they want a column, checking if one is provided
    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" in compute_domain_kwargs:
            accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")
        else:
            # If column not given
            raise ge_exceptions.GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )

    # Else, if column pair values requested
    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        # Ensuring column_A and column_B parameters provided
        if (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop(
                "column_A"
            )
            accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop(
                "column_B"
            )
        else:
            raise ge_exceptions.GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )

    # Else, if a multicolumn domain is requested, transfer "columns" to the accessor kwargs
    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "columns" in compute_domain_kwargs:
            accessor_domain_kwargs["columns"] = compute_domain_kwargs.pop("columns")

    # Filtering if identity
    elif domain_type == MetricDomainTypes.IDENTITY:
        # If we would like our data to become a single column
        if "column" in compute_domain_kwargs:
            data = pd.DataFrame(data[compute_domain_kwargs["column"]])

        # If we would like our data to now become a column pair
        elif ("column_A" in compute_domain_kwargs) and (
            "column_B" in compute_domain_kwargs
        ):
            # Dropping all columns that are not needed
            column_a, column_b = (
                compute_domain_kwargs["column_A"],
                compute_domain_kwargs["column_B"],
            )
            data = pd.DataFrame(
                {column_a: data[column_a], column_b: data[column_b]}
            )

        else:
            # If we would like our data to become a multicolumn
            if "columns" in compute_domain_kwargs:
                data = data[compute_domain_kwargs["columns"]]

    return data, compute_domain_kwargs, accessor_domain_kwargs
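# Usage sketch (illustrative): resolving a COLUMN domain with a pandas row
# condition through the method above, reusing the engine from the earlier
# load_batch_data sketch; batch and column names are placeholders.
def _example_column_compute_domain(engine):
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "batch_id": "example_batch",
            "column": "price",
            "row_condition": "cost > 0",
            "condition_parser": "pandas",
        },
        domain_type=MetricDomainTypes.COLUMN,
    )
    # data is the batch filtered to rows where cost > 0 (index reset);
    # accessor_kwargs == {"column": "price"}, while compute_kwargs keeps the
    # batch_id and the row-condition directives.
    return data, compute_kwargs, accessor_kwargs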
def _add_spark_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "spark"},
        success=True,
    )

    if not _verify_pyspark_dependent_modules():
        return None

    if passthrough_generator_only:
        datasource_name = "files_spark_datasource"

        # configuration = SparkDFDatasource.build_configuration(batch_kwargs_generators={
        #     "default": {
        #         "class_name": "PassthroughGenerator",
        #     }
        # }
        # )
        configuration = SparkDFDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        ).strip()
        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = SparkDFDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )

    configuration["class_name"] = "SparkDFDatasource"
    configuration["module_name"] = "great_expectations.datasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # Note: errors is a dict, so it cannot be formatted with the "{:s}" spec.
        raise ge_exceptions.GreatExpectationsError(
            f"Invalid Datasource configuration: {errors}"
        )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )

    toolkit.confirm_proceed_or_exit()

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
def _add_sqlalchemy_datasource(context, prompt_for_datasource_name=True):
    msg_success_database = (
        "\n<green>Great Expectations connected to your database!</green>"
    )

    if not _verify_sqlalchemy_dependent_modules():
        return None

    db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))]
    selected_database = (
        int(
            click.prompt(
                msg_prompt_choose_database,
                type=click.Choice(db_choices),
                show_choices=False,
            )
        )
        - 1
    )  # don't show user a zero index list :)

    selected_database = list(SupportedDatabases)[selected_database]

    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "sqlalchemy", "db": selected_database.name},
        success=True,
    )

    datasource_name = "my_{}_db".format(selected_database.value.lower())
    if selected_database == SupportedDatabases.OTHER:
        datasource_name = "my_database"
    if prompt_for_datasource_name:
        datasource_name = click.prompt(
            msg_prompt_datasource_name, default=datasource_name
        )

    credentials = {}
    # Since we don't want to save the database credentials in the config file that will be
    # committed in the repo, we will use our Variable Substitution feature to store the credentials
    # in the credentials file (which will not be committed, since it is in the uncommitted directory)
    # with the datasource's name as the variable name.
    # The value of the datasource's "credentials" key in the config file (great_expectations.yml) will
    # be ${datasource name}.
    # Great Expectations will replace the ${datasource name} with the value from the credentials file
    # at runtime.
    while True:
        cli_message(msg_db_config.format(datasource_name))

        if selected_database == SupportedDatabases.MYSQL:
            if not _verify_mysql_dependent_modules():
                return None
            credentials = _collect_mysql_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.POSTGRES:
            if not _verify_postgresql_dependent_modules():
                return None
            credentials = _collect_postgres_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.REDSHIFT:
            if not _verify_redshift_dependent_modules():
                return None
            credentials = _collect_redshift_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.SNOWFLAKE:
            if not _verify_snowflake_dependent_modules():
                return None
            credentials = _collect_snowflake_credentials(
                default_credentials=credentials
            )
        elif selected_database == SupportedDatabases.BIGQUERY:
            if not _verify_bigquery_dependent_modules():
                return None
            credentials = _collect_bigquery_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.OTHER:
            sqlalchemy_url = click.prompt(
                """What is the url/connection string for the sqlalchemy connection?
(reference: https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls)
""",
                show_default=False,
            ).strip()
            credentials = {"url": sqlalchemy_url}

        context.save_config_variable(datasource_name, credentials)

        message = """
<red>Cannot connect to the database.</red>
  - Please check your environment and the configuration you provided.
  - Database Error: {0:s}"""
        try:
            cli_message(
                "<cyan>Attempting to connect to your database. This may take a moment...</cyan>"
            )

            configuration = SqlAlchemyDatasource.build_configuration(
                credentials="${" + datasource_name + "}"
            )
            configuration["class_name"] = "SqlAlchemyDatasource"
            configuration["module_name"] = "great_expectations.datasource"
            errors = DatasourceConfigSchema().validate(configuration)
            if len(errors) != 0:
                # Note: errors is a dict, so it cannot be formatted with the "{:s}" spec.
                raise ge_exceptions.GreatExpectationsError(
                    f"Invalid Datasource configuration: {errors}"
                )

            cli_message(
                """
Great Expectations will now add a new Datasource '{0:s}' to your deployment, by adding this entry to your great_expectations.yml:

{1:s}
The credentials will be saved in uncommitted/config_variables.yml under the key '{0:s}'
""".format(
                    datasource_name,
                    textwrap.indent(
                        toolkit.yaml.dump({datasource_name: configuration}), "  "
                    ),
                )
            )

            toolkit.confirm_proceed_or_exit()
            context.add_datasource(name=datasource_name, **configuration)
            cli_message(msg_success_database)
            break
        except ModuleNotFoundError as de:
            cli_message(message.format(str(de)))
            return None
        except DatasourceInitializationError as de:
            cli_message(message.format(str(de)))
            if not click.confirm("Enter the credentials again?", default=True):
                context.add_datasource(
                    datasource_name,
                    initialize=False,
                    module_name="great_expectations.datasource",
                    class_name="SqlAlchemyDatasource",
                    data_asset_type={"class_name": "SqlAlchemyDataset"},
                    credentials="${" + datasource_name + "}",
                )
                # TODO this message about continuing may not be accurate
                cli_message(
                    """
We saved datasource {:s} in {:s} and the credentials you entered in {:s}.
Since we could not connect to the database, you can complete troubleshooting in the configuration files documented in the how-to guides here:
<blue>https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_datasources.html?utm_source=cli&utm_medium=init&utm_campaign={:s}#{:s}</blue> .

After you connect to the datasource, run great_expectations init to continue.
""".format(
                        datasource_name,
                        DataContext.GE_YML,
                        context.get_config()["config_variables_file_path"],
                        rtd_url_ge_version,
                        selected_database.value.lower(),
                    )
                )
                return None

    return datasource_name
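# Illustrative sketch of the credential-substitution pattern described in the
# comments above, outside the interactive prompt. All values are placeholders.
def _example_save_credentials():
    from great_expectations.data_context import DataContext

    context = DataContext()
    context.save_config_variable(
        "my_postgres_db",
        {
            "drivername": "postgresql",
            "host": "localhost",
            "port": "5432",
            "username": "ge_user",
            "password": "REPLACE_ME",
            "database": "analytics",
        },
    )
    # great_expectations.yml can then reference the stored secret as:
    #   credentials: ${my_postgres_db}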
def _add_pandas_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "pandas"},
        success=True,
    )

    if passthrough_generator_only:
        datasource_name = "files_datasource"
        configuration = PandasDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        )

        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = PandasDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )

    configuration["class_name"] = "PandasDatasource"
    configuration["module_name"] = "great_expectations.datasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # Note: errors is a dict, so it cannot be formatted with the "{:s}" spec.
        raise ge_exceptions.GreatExpectationsError(
            f"Invalid Datasource configuration: {errors}"
        )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )

    toolkit.confirm_proceed_or_exit(
        continuation_message="Okay, exiting now. To learn more about adding datasources, run great_expectations "
        "datasource --help or visit https://docs.greatexpectations.io/"
    )

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
def get_compute_domain(
    self,
    domain_kwargs: dict,
    domain_type: Union[str, MetricDomainTypes],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple[pd.DataFrame, dict, dict]:
    """
    Uses the given domain kwargs (which include row_condition, condition_parser, and ignore_row_if directives) to
    obtain and/or query a batch. Returns in the format of a Pandas DataFrame. If the domain is a single column,
    this is added to 'accessor domain kwargs' and used for later access

    Args:
        domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
        domain_type (str or MetricDomainTypes) - an Enum value indicating which metric domain the user would
        like to be using, or a corresponding string value representing it. String types include "column",
        "column_pair", "table", and "other". Enum types include capitalized versions of these from the
        class MetricDomainTypes.
        accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when
        describing the domain and simply transferred with their associated values into accessor_domain_kwargs.

    Returns:
        A tuple including:
          - a DataFrame (the data on which to compute)
          - a dictionary of compute_domain_kwargs, describing the DataFrame
          - a dictionary of accessor_domain_kwargs, describing any accessors needed to
            identify the domain within the compute domain
    """
    data = self.get_domain_records(domain_kwargs=domain_kwargs)

    # Extracting value from enum if it is given for future computation
    domain_type = MetricDomainTypes(domain_type)

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = {}
    table = domain_kwargs.get("table", None)
    if table:
        raise ValueError(
            "PandasExecutionEngine does not currently support multiple named tables."
        )

    # Warning user if accessor keys are in any domain that is not of type table; they will be ignored
    if (
        domain_type != MetricDomainTypes.TABLE
        and accessor_keys is not None
        and len(list(accessor_keys)) > 0
    ):
        logger.warning(
            'Accessor keys ignored since Metric Domain Type is not "table"'
        )

    # If given table (this is default), get all unexpected accessor_keys (an optional parameter allowing us to
    # modify domain access)
    if domain_type == MetricDomainTypes.TABLE:
        if accessor_keys is not None and len(list(accessor_keys)) > 0:
            for key in accessor_keys:
                accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        if len(compute_domain_kwargs.keys()) > 0:
            # Warn user if kwarg not "normal".
            unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
                {
                    "batch_id",
                    "table",
                    "row_condition",
                    "condition_parser",
                }
            )
            if len(unexpected_keys) > 0:
                unexpected_keys_str: str = ", ".join(
                    map(lambda element: f'"{element}"', unexpected_keys)
                )
                logger.warning(
                    f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
                )
        return data, compute_domain_kwargs, accessor_domain_kwargs

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise ge_exceptions.GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )

        accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not ("column_A" in domain_kwargs and "column_B" in domain_kwargs):
            raise ge_exceptions.GreatExpectationsError(
                "column_A or column_B not found within domain_kwargs"
            )

        accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop("column_A")
        accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop("column_B")

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "column_list" not in domain_kwargs:
            raise ge_exceptions.GreatExpectationsError(
                "column_list not found within domain_kwargs"
            )

        column_list = compute_domain_kwargs.pop("column_list")

        if len(column_list) < 2:
            raise ge_exceptions.GreatExpectationsError(
                "column_list must contain at least 2 columns"
            )

        accessor_domain_kwargs["column_list"] = column_list

    return data, compute_domain_kwargs, accessor_domain_kwargs
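# Usage sketch (illustrative): the newer get_compute_domain above delegates row
# filtering to get_domain_records and expects "column_list" (not "columns") for
# MULTICOLUMN domains, enforcing a minimum of two columns.
def _example_multicolumn_compute_domain(engine):
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"batch_id": "example_batch", "column_list": ["price", "cost"]},
        domain_type=MetricDomainTypes.MULTICOLUMN,
    )
    # accessor_kwargs == {"column_list": ["price", "cost"]}; a single-column
    # column_list would raise GreatExpectationsError per the check above.
    return data, compute_kwargs, accessor_kwargs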