def test_from_pandas(): pd_df = pd.DataFrame({ "x": [1, 3, 5, 7, 9], "y": [2, 4, 6, 8, 10], "z": [None, "a", "b", "c", "abc"] }) ge_df = ge.from_pandas(pd_df) assert isinstance(ge_df, ge.data_asset.DataAsset) assert list(ge_df.columns) == ["x", "y", "z"] assert list(ge_df["x"]) == list(pd_df["x"]) assert list(ge_df["y"]) == list(pd_df["y"]) assert list(ge_df["z"]) == list(pd_df["z"]) # make an empty subclass to test dataset_class argument class CustomPandasDataset(ge.dataset.PandasDataset): pass ge_df_custom = ge.from_pandas(pd_df, dataset_class=CustomPandasDataset) assert not isinstance(ge_df, CustomPandasDataset) assert isinstance(ge_df_custom, CustomPandasDataset) assert list(ge_df_custom.columns) == ["x", "y", "z"] assert list(ge_df_custom["x"]) == list(pd_df["x"]) assert list(ge_df_custom["y"]) == list(pd_df["y"]) assert list(ge_df_custom["z"]) == list(pd_df["z"])
def test_from_pandas():
    pd_df = pd.DataFrame({
        'x': [1, 3, 5, 7, 9],
        'y': [2, 4, 6, 8, 10],
        'z': [None, 'a', 'b', 'c', 'abc']
    })

    ge_df = ge.from_pandas(pd_df)
    assert isinstance(ge_df, ge.dataset.Dataset)
    assert list(ge_df.columns) == ['x', 'y', 'z']
    assert list(ge_df['x']) == list(pd_df['x'])
    assert list(ge_df['y']) == list(pd_df['y'])
    assert list(ge_df['z']) == list(pd_df['z'])

    # make an empty subclass to test dataset_class argument
    class CustomPandasDataset(ge.dataset.PandasDataset):
        pass

    ge_df_custom = ge.from_pandas(pd_df, dataset_class=CustomPandasDataset)
    assert not isinstance(ge_df, CustomPandasDataset)
    assert isinstance(ge_df_custom, CustomPandasDataset)
    assert list(ge_df_custom.columns) == ['x', 'y', 'z']
    assert list(ge_df_custom['x']) == list(pd_df['x'])
    assert list(ge_df_custom['y']) == list(pd_df['y'])
    assert list(ge_df_custom['z']) == list(pd_df['z'])
def _get_expectations(
    self, field_mapping: FieldMapping, field_name: str
) -> ge.dataset.Dataset:
    field_mapping_ge = ge.from_pandas(field_mapping.get_field_mapping_df())
    field_mapping_ge.set_default_expectation_argument("result_format", "COMPLETE")

    # Shape
    shape_expectation = field_mapping_ge.expect_table_columns_to_match_ordered_list(
        common.COLUMN_NAMES
    )
    # If the shape isn't correct then the following expectations will raise exceptions
    if not shape_expectation["success"]:
        return field_mapping_ge

    field = self.table_schema.get_field(field_name)

    # Check that there aren't multiple mappings for the same input value
    field_mapping_ge.expect_column_values_to_be_unique(common.INPUT_COLUMN_NAME)

    # Check that the mapped values are all part of the available options (blank value is also valid)
    valid_mapped_values = self._get_valid_mappings_for_field(field) + [""]
    field_mapping_ge.expect_column_values_to_be_in_set(
        common.OUTPUT_COLUMN_NAME, valid_mapped_values
    )

    # Check that the approved column values are all "Yes", "No", or blank (assumed to be no)
    field_mapping_ge.expect_column_values_to_be_in_set(
        common.APPROVED_COLUMN_NAME, common.VALID_APPROVED_VALUES + ["None"]
    )

    return field_mapping_ge
def _do_expectation(_info, df):
    ge_df = ge.from_pandas(df)
    ge_result = ge_callback(ge_df)
    check.invariant('success' in ge_result)
    return ExpectationResult(
        success=ge_result['success'],
        metadata_entries=[
            EventMetadataEntry.json(label='result', data=ge_result)
        ],
    )
def _get_expectations(self, field_mapping: FieldMapping) -> ge.dataset.Dataset:
    field_mapping_ge = ge.from_pandas(field_mapping.get_field_mapping_df())
    field_mapping_ge.set_default_expectation_argument("result_format", "COMPLETE")

    # Check that the approved column values are all "Yes"
    field_mapping_ge.expect_column_values_to_be_in_set(
        common.APPROVED_COLUMN_NAME, [common.APPROVED]
    )

    return field_mapping_ge
def check_iris_data(df: DataFrame):
    gdf = ge.from_pandas(df)
    result = gdf.expect_column_values_to_be_in_set(
        "species", ["setosa", "virginica", "versicolor"], mostly=0.99
    )
    if not result.success:
        raise Exception("iris data is no good")
    return df
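# A minimal, self-contained usage sketch for the check above (toy data assumed,
# not from the original source). It illustrates that mostly=0.99 tolerates up
# to 1% of rows falling outside the allowed species set.
import pandas as pd

good = pd.DataFrame({"species": ["setosa", "virginica", "versicolor"] * 10})
check_iris_data(good)               # passes: every value is in the allowed set

bad = good.copy()
bad.loc[0, "species"] = "unknown"   # 1 of 30 rows (~3%) is invalid, above the 1% budget
# check_iris_data(bad)              # would raise Exception("iris data is no good")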
def validate_with_great_expectations(
    self,
    dataframe: pd.DataFrame,
    expectation_suite: TypeVar("ge.core.ExpectationSuite"),
    ge_validate_kwargs: Optional[Dict[Any, Any]] = {},
):
    report = ge.from_pandas(
        dataframe, expectation_suite=expectation_suite
    ).validate(**ge_validate_kwargs)
    return report
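# A hedged usage sketch for the method above. "validator" is a hypothetical
# instance of whatever class defines validate_with_great_expectations, and the
# suite here is built from another DataFrame's recorded expectations; none of
# these names come from the original source.
import pandas as pd
import great_expectations as ge

reference = ge.from_pandas(pd.DataFrame({"amount": [1, 2, 3]}))
reference.expect_column_values_to_not_be_null("amount")
suite = reference.get_expectation_suite(discard_failed_expectations=False)

new_df = pd.DataFrame({"amount": [4, 5, None]})
report = validator.validate_with_great_expectations(new_df, expectation_suite=suite)
print(report)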
def test_from_pandas(self):
    pd_df = pd.DataFrame({
        'x': [1, 3, 5, 7, 9],
        'y': [2, 4, 6, 8, 10],
        'z': [None, 'a', 'b', 'c', 'abc']
    })

    ge_df = ge.from_pandas(pd_df)
    self.assertIsInstance(ge_df, ge.dataset.Dataset)
    self.assertEqual(list(ge_df.columns), ['x', 'y', 'z'])
    self.assertEqual(list(ge_df['x']), list(pd_df['x']))
    self.assertEqual(list(ge_df['y']), list(pd_df['y']))
    self.assertEqual(list(ge_df['z']), list(pd_df['z']))
def sanity_check(date, bucket):
    s3 = s3fs.S3FileSystem()
    output = ge.from_pandas(
        pq.ParquetDataset(
            "s3://" + bucket + "/usp/output/date=" + date, filesystem=s3
        ).read_pandas().to_pandas())
    output.expect_column_values_to_be_in_set('accommodation_ns', [100])
    status = output.validate()
    print(status)
    if not status['success']:
        raise ValueError
def _file_passes(_info, df):
    with open(file_path) as ff:
        expt_config = json.load(ff)

    # This is necessary because ge coerces a type change on the same dataframe
    # instance, changing its dtype in place. A ge dataframe also can't be
    # copied directly, because that raises:
    # AttributeError: 'PandasDataset' object has no attribute 'discard_subset_failing_expectations'
    # See https://github.com/great-expectations/great_expectations/issues/342
    df_copy = copy.deepcopy(df)

    ge_df = ge.from_pandas(df_copy, expt_config)
    validate_result = ge_df.validate()
    check.invariant('success' in validate_result)
    check.invariant('results' in validate_result)
    return ExpectationResult(success=validate_result['success'],
                             result_context=validate_result)
def data_qual(df: pd.DataFrame):
    """
    Data-quality check for the pipeline.

    :param df: input DataFrame to validate
    :return: None; raises if the check fails
    """
    df = ge.from_pandas(df)

    # define the checks
    result = df.expect_column_values_to_be_in_set(
        'Subscriber_Type',
        list(df['Subscriber_Type'].unique()),
        mostly=.95)

    if not result['success']:
        err = result["exception_info"]
        raise Exception(f"Unexpected data in the Subscriber_Type column\n{err}")
def test_dataframe(df):
    """
    Test the resulting dataframe to ensure data integrity.
    Can be expanded if further tests are needed.
    """
    print("Performing final validation test on dataframe")
    ge_df = ge.from_pandas(df)

    # Check nulls
    columns_not_null = df.columns
    for column in columns_not_null:
        result = ge_df.expect_column_values_to_not_be_null(column)
        assert result["success"] == True, \
            "Columns contain null values. Please check data."

    # Ensure result is only between 0 and 1
    result = ge_df.expect_column_values_to_be_between("result",
                                                      min_value=0,
                                                      max_value=1)
    assert result["success"] == True, \
        "Results not between 0 and 1, please check data"

    # Make sure the gold delta does not have outliers
    gold_delta = [
        "golddiffat10",
        "golddiffat15",
        "teamgolddiffat10",
        "teamgolddiffat15",
    ]
    for column in gold_delta:
        result = ge_df.expect_column_values_to_be_between(column,
                                                          min_value=-20000,
                                                          max_value=20000)
        assert result["success"] == True, \
            "Gold diff values too high, please check data"

    print("Data validation passed.")
    return df
def test_failure_map_values_unique(self):
    dataset_ge = ge.from_pandas(TEST_DATA)
    dataset_ge.set_default_expectation_argument("result_format", "COMPLETE")
    dataset_ge.expect_column_values_to_be_unique("column_one")
    failure_map = common.ge_results_to_failure_map(
        {"dataset": dataset_ge.validate()})
    self.assertEqual(
        {
            "dataset": {
                common.EXPECT_VALUES_UNIQUE_KEY: {
                    common.COLUMN_NAME_KEY: "column_one",
                    common.FAILED_VALUES_KEY: ["value1"],
                }
            }
        },
        failure_map,
    )
def test_from_pandas_expectations_config():
    # Logic mostly copied from TestValidation.test_validate

    def load_ge_config(file):
        with open(file) as f:
            return json.load(f)

    my_expectations_config = load_ge_config(
        "./tests/test_sets/titanic_expectations.json")

    pd_df = pd.read_csv("./tests/test_sets/Titanic.csv")
    my_df = ge.from_pandas(pd_df, expectations_config=my_expectations_config)
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    results = my_df.validate(catch_exceptions=False)

    expected_results = load_ge_config(
        "./tests/test_sets/expected_results_20180303.json")

    assertDeepAlmostEqual(results, expected_results)
def _get_shape_expectations(self, dataset: pd.DataFrame) -> ge.dataset.Dataset:
    """
    Validates dataset shape.

    Validations:
    - Dataset column names are a subset of mapped column names and table_schema field names
    """
    dataset_ge = ge.from_pandas(dataset, dataset_class=ShapePandasDataset)
    dataset_ge.set_default_expectation_argument("result_format", "COMPLETE")

    valid_field_names: List[str] = table_schema.get_valid_field_names(
        self.table_schema, self.row_format)
    valid_cols: List[str] = list(
        self.column_mapping.keys()) + valid_field_names
    dataset_ge.expect_table_columns_to_be_in_set(valid_cols)
    dataset_ge.expect_named_cols()
    return dataset_ge
def test_failure_map_values_in_set(self):
    dataset_ge = ge.from_pandas(TEST_DATA)
    dataset_ge.set_default_expectation_argument("result_format", "COMPLETE")
    dataset_ge.expect_column_values_to_be_in_set(
        "column_two", set(["invalid_value", "row1_col2"]))
    failure_map = common.ge_results_to_failure_map(
        {"dataset": dataset_ge.validate()})
    self.assertEqual(
        {
            "dataset": {
                common.EXPECT_VALUES_IN_SET_KEY: {
                    common.COLUMN_NAME_KEY: "column_two",
                    common.FAILED_VALUES_KEY: ["row2_col2"],
                }
            }
        },
        failure_map,
    )
def test_failure_map_values(self):
    dataset_ge = ge.from_pandas(TEST_DATA, dataset_class=ShapePandasDataset)
    dataset_ge.set_default_expectation_argument("result_format", "COMPLETE")
    dataset_ge.expect_table_columns_to_be_in_set(
        set(["column_one", "other_column"]))
    failure_map = common.ge_results_to_failure_map(
        {"dataset": dataset_ge.validate()})
    self.assertEqual(
        {
            "dataset": {
                common.EXPECT_COLUMNS_IN_SET_KEY: {
                    common.FAILED_VALUES_KEY: ["column_two"]
                }
            }
        },
        failure_map,
    )
def duplicate_and_obfuscuate(df):
    df_a = df.copy()
    df_b = df.copy()

    df_a["group"] = "a"
    df_b["group"] = "b"

    for column in df_b.columns:
        if column == "group":
            continue

        if df_b[column].dtype in ["int", "float"]:
            df_b[column] += np.max(df_b[column])
            continue

        if df_b[column].dtype == "object" and all(
            [(type(elem) == str) or (elem is None) for elem in df_b[column]]
        ):
            df_b[column] = df_b[column].astype(str) + "__obfuscate"
            continue

    return ge.from_pandas(pd.concat([df_a, df_b], ignore_index=True, sort=False))
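# A small illustrative call for the helper above, using toy data that is not
# part of the original source. The returned object is a great_expectations
# dataset (a pandas DataFrame subclass) holding every row twice: group "a"
# keeps the original values, group "b" gets shifted numerics / suffixed strings.
toy_df = pd.DataFrame({"score": [1.0, 2.0], "label": ["x", "y"]})
ge_df = duplicate_and_obfuscuate(toy_df)
print(ge_df[["group", "score", "label"]])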
def __init__(self, df, meta_data):
    """
    Takes a pandas dataframe and a table meta data object and checks
    the values in the dataframe against the meta data.
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas dataframe object")

    self.meta_data = meta_data
    self.validate_meta_data()
    self.meta_cols = meta_data["columns"]

    # Placeholder for proper schema check
    if not isinstance(self.meta_cols, list):
        raise TypeError("meta_cols must be a list of objects")

    # This never fails, but the resultant types are not guaranteed to be correct
    df = impose_metadata_types_on_pd_df(df, meta_data)

    self.df_ge = ge.from_pandas(df)

    self.vlog = ValidationLog(self)
def _get_expectations(self, column_mapping: pd.DataFrame) -> ge.dataset.Dataset:
    """
    Returns great_expectations object for a pd.DataFrame with expectations attached.
    If not all expectations have been satisfied, this function may fail early
    for readability of the failed expectation(s).

    Expectations:
    - Has columns matching exactly to COLUMN_NAMES
    - Does not have any repeated local column names
    - All supposed GII columns are valid ones
    """
    column_mapping_ge = ge.from_pandas(column_mapping)
    column_mapping_ge.set_default_expectation_argument("result_format", "COMPLETE")

    # Shape
    shape_expectation = column_mapping_ge.expect_table_columns_to_match_ordered_list(
        COLUMN_NAMES
    )
    # If the shape isn't correct then the following expectations will raise exceptions
    if not shape_expectation["success"]:
        return column_mapping_ge

    # Check that there are no repeats of an internal column name
    column_mapping_ge.expect_column_values_to_be_unique(
        INTERNAL_COLUMN_NAME_COLUMN_NAME
    )

    # Check that all Mission Impact field names are valid
    valid_field_names = table_schema.get_valid_field_names(
        self.table_schema, self.row_format
    )
    column_mapping_ge.expect_column_values_to_be_in_set(
        MI_FIELD_NAME_COLUMN_NAME, valid_field_names
    )

    return column_mapping_ge
def test_failure_map_column_ordered_list(self):
    dataset_ge = ge.from_pandas(TEST_DATA)
    dataset_ge.set_default_expectation_argument("result_format", "COMPLETE")
    dataset_ge.expect_table_columns_to_match_ordered_list(
        ["column_one", "other_column"])
    failure_map = common.ge_results_to_failure_map(
        {"dataset": dataset_ge.validate()})
    self.assertEqual(
        {
            "dataset": {
                common.EXPECT_COLUMNS_MATCH_KEY: {
                    common.EXPECTED_ORDERED_LIST_KEY: [
                        "column_one",
                        "other_column",
                    ],
                    common.FAILED_VALUES_KEY: ["column_two"],
                }
            }
        },
        failure_map,
    )
def _do_expectation(_info, df):
    ge_df = ge.from_pandas(df)
    ge_result = ge_callback(ge_df)
    check.invariant('success' in ge_result)
    return ExpectationResult(success=ge_result['success'],
                             result_context=ge_result)
RVMS_Current_Budgeted_CPU['ModelName'] = RVMS_Current_Budgeted_CPU.apply(
    getModelName, axis='columns')
RVMS_Current_Budgeted_CPU['DestCode'] = RVMS_Current_Budgeted_CPU.apply(
    getDestCode, axis='columns')

# %%
RVMS_Current_Budgeted_CPU.head()

# %% [markdown]
# ### Perform data validation checks using Great Expectations library

# %% [markdown]
# #### Create Great Expectations dataframe from pandas dataframe:

# %%
ge_df = ge.from_pandas(RVMS_Current_Budgeted_CPU)

# %% [markdown]
# #### Check Model Years are between 1994 and 2099

# %%
if ge_df.expect_column_values_to_be_between(column="ModelYear",
                                            min_value='1994',
                                            max_value='2099')['success']:
    print('Passed Model Year Check')
else:
    print('FAILED Model Year Check')
    toaster = ToastNotifier()
    toaster.show_toast("### Check Status ###",
                       "FAILED Model Year Check",
                       icon_path="images/honda_logo.ico",
def test_errors_warnings_validation_operator_run_slack_query(
        basic_data_context_config_for_validation_operator, tmp_path_factory,
        filesystem_csv_4):
    #####
    #####
    #
    # WARNING: PY2 SUPPORT IS UNTESTED BECAUSE OF DICTIONARY ORDER ISSUES NOT YET RESOLVED
    #
    #####
    #####

    if PY2:
        pytest.skip(
            "skipping test_errors_warnings_validation_operator_run_slack_query in py2"
        )

    project_path = str(tmp_path_factory.mktemp('great_expectations'))

    # NOTE: This setup is almost identical to test_DefaultDataContextAwareValidationOperator.
    # Consider converting to a single fixture.
    data_context = ConfigOnlyDataContext(
        basic_data_context_config_for_validation_operator,
        project_path,
    )

    data_context.add_datasource("my_datasource",
                                class_name="PandasDatasource",
                                base_directory=str(filesystem_csv_4))

    data_context.create_expectation_suite(
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="failure")
    df = data_context.get_batch("my_datasource/default/f1",
                                "failure",
                                batch_kwargs=data_context.yield_batch_kwargs(
                                    "my_datasource/default/f1"))
    df.expect_column_values_to_be_between(column="x", min_value=1, max_value=9)
    failure_expectations = df.get_expectation_suite(
        discard_failed_expectations=False)
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="failure")

    data_context.create_expectation_suite(
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="warning")
    df = data_context.get_batch("my_datasource/default/f1",
                                "warning",
                                batch_kwargs=data_context.yield_batch_kwargs(
                                    "my_datasource/default/f1"))
    df.expect_column_values_to_be_between(column="x", min_value=1, max_value=9)
    df.expect_column_values_to_not_be_null(column="y")
    warning_expectations = df.get_expectation_suite(
        discard_failed_expectations=False)
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="warning")

    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f2",
        expectation_suite_name="failure")
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f3",
        expectation_suite_name="failure")
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f2",
        expectation_suite_name="warning")
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f3",
        expectation_suite_name="warning")

    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        slack_webhook="https://hooks.slack.com/services/test/slack/webhook")

    my_df_1 = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 4, None]})
    my_ge_df_1 = ge.from_pandas(my_df_1)
    my_ge_df_1._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f1")

    my_df_2 = pd.DataFrame({"x": [1, 2, 3, 4, 99], "y": [1, 2, 3, 4, 5]})
    my_ge_df_2 = ge.from_pandas(my_df_2)
    my_ge_df_2._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f2")

    my_df_3 = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 4, 5]})
    my_ge_df_3 = ge.from_pandas(my_df_3)
    my_ge_df_3._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f3")

    return_obj = vo.run(
        assets_to_validate=[my_ge_df_1, my_ge_df_2, my_ge_df_3],
        run_id="test_100")
    slack_query = vo._build_slack_query(return_obj)
    expected_slack_query = {
        'blocks': [{
            'type': 'divider'
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*FailureVsWarning Validation Operator Completed.*'
            }
        }, {
            'type': 'divider'
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Status*: Failed :x:'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Data Asset List:* [my_datasource/default/f1, my_datasource/default/f2, my_datasource/default/f3]'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Failed Data Assets:* [my_datasource/default/f2]'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Run ID:* test_100'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Timestamp:* 09/26/2019 13:42:41'
            }
        }, {
            'type': 'divider'
        }, {
            'type': 'context',
            'elements': [{
                'type': 'mrkdwn',
                'text': 'Learn about FailureVsWarning Validation Operators at https://docs.greatexpectations.io/en/latest/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.html'
            }]
        }]
    }

    # We're okay with system variation in locales (OS X likes 24 hour, but not Travis)
    slack_query['blocks'][7]['text']['text'] = \
        slack_query['blocks'][7]['text']['text'].replace('09/26/2019 13:42:41', 'LOCALEDATE')
    slack_query['blocks'][7]['text']['text'] = \
        slack_query['blocks'][7]['text']['text'].replace('09/26/2019 01:42:41 PM', 'LOCALEDATE')
    expected_slack_query['blocks'][7]['text']['text'] = \
        expected_slack_query['blocks'][7]['text']['text'].replace('09/26/2019 13:42:41', 'LOCALEDATE')
    expected_slack_query['blocks'][7]['text']['text'] = \
        expected_slack_query['blocks'][7]['text']['text'].replace('09/26/2019 01:42:41 PM', 'LOCALEDATE')

    import json
    print(json.dumps(slack_query, indent=2))
    print(json.dumps(expected_slack_query, indent=2))

    assert slack_query == expected_slack_query
import mlflow
import pandas
from pandas_profiling import ProfileReport
import great_expectations as ge
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

if __name__ == "__main__":
    with mlflow.start_run(run_name="check_verify_data") as run:
        mlflow.set_tag("mlflow.runName", "check_verify_data")

        df = pandas.read_csv("./data/raw/data.csv")

        describe_to_dict = df.describe().to_dict()
        mlflow.log_dict(describe_to_dict, "describe_data.json")

        pd_df_ge = ge.from_pandas(df)

        assert pd_df_ge.expect_column_values_to_match_strftime_format(
            "Date", "%Y-%m-%d").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "High", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Low", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Open", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Close", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Volume", "long").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Adj Close", "float").success == True
def _get_batch_kwargs(df: pd.DataFrame) -> dict:
    dataset = ge.from_pandas(df)
    return {"dataset": dataset, "datasource": "pandas"}
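# A hedged sketch of how batch kwargs like these are typically consumed by the
# great_expectations v2 batch-kwargs API. The DataContext project, the "pandas"
# datasource, the suite name "my_suite", and the toy DataFrame are assumptions,
# not part of the original source.
import pandas as pd
import great_expectations as ge

df = pd.DataFrame({"x": [1, 2, 3]})
context = ge.data_context.DataContext()  # loads an existing GE project config
batch = context.get_batch(_get_batch_kwargs(df), expectation_suite_name="my_suite")
results = context.run_validation_operator(
    "action_list_operator", assets_to_validate=[batch]
)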