def test_catch_exceptions_with_bad_expectation_type():
    # We want to catch degenerate cases where an expectation suite is incompatible with the dataset it is applied to
    my_df = PandasDataset({"x": range(10)})
    my_df._expectation_suite.append_expectation(
        ExpectationConfiguration(expectation_type="foobar", kwargs={})
    )
    result = my_df.validate(catch_exceptions=True)

    # Find the foobar result
    idx = 0
    for idx, val_result in enumerate(result.results):
        if val_result.expectation_config.expectation_type == "foobar":
            break

    assert result.results[idx].success is False
    assert result.results[idx].expectation_config.expectation_type == "foobar"
    assert result.results[idx].expectation_config.kwargs == {}
    assert result.results[idx].exception_info["raised_exception"] is True
    assert (
        "AttributeError: 'PandasDataset' object has no attribute 'foobar'"
        in result.results[idx].exception_info["exception_traceback"]
    )

    with pytest.raises(AttributeError):
        result = my_df.validate(catch_exceptions=False)
Example 2
    def calculate_integrity(self, df_ge: PandasDataset,
                            specs: SchemaParserResult) -> dict:
        """
        Calculates the integrity from the defined types and the expectations.
        """
        def get_unexpected(ge_result):
            return ge_result.get('unexpected_count', 0)

        def merge_dicts(d1, d2):
            for key, values in d2.items():
                d1[key].extend(values)
            return d1

        all_elements = defaultdict(list)
        invalid_elements = defaultdict(list)
        for definition in specs.type_definitions:
            result = df_ge.expect_column_to_exist(definition)
            if not result.success:
                if definition in specs.required_types:  # only counts as an error if required
                    invalid_elements[definition].append(df_ge.shape[0])
                    all_elements[definition].append(df_ge.shape[0])
                continue

            # check missing values
            result = df_ge.expect_column_values_to_not_be_null(definition)
            if definition in specs.required_types:  # only counts as an error if required
                invalid_elements[definition].append(
                    get_unexpected(result.result))
            all_elements[definition].append(result.result['element_count'])

            # check not correct types
            type_specification = TypeSpecification.create(
                specs.type_definitions.get(definition))
            type_list = [t.__name__ for t in type_specification.get_types()]
            # noinspection PyTypeChecker
            result = df_ge.expect_column_values_to_be_in_type_list(
                definition, type_list)
            invalid_elements[definition].append(get_unexpected(result.result))

        # handle attributes that are not specified
        not_specified_fields = set(df_ge.columns) - set(specs.type_definitions)
        if len(not_specified_fields) > 0:
            for attribute in not_specified_fields:
                result = df_ge.expect_column_values_to_be_null(attribute)
                # the integrity of unspecified fields is defined as 1, so we add 0 to the unexpected counts
                invalid_elements[attribute].append(0)
                all_elements[attribute].append(get_unexpected(result.result))

        # check expectations
        expectation_violations = self.validate_expectations(df_ge, specs)
        merge_dicts(invalid_elements, expectation_violations)

        # flatten attribute metrics
        integrity_details = dict()
        for k, v in invalid_elements.items():
            integrity_details[k] = 1 - (np.sum(v) / sum(all_elements[k]))

        return integrity_details
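For reference, a minimal sketch of the per-column integrity formula applied in the final loop above, using hypothetical counts:

# Hypothetical counts for one column: two sources of invalid elements
# (missing values, wrong types) against a single total element count.
invalid_elements = {"age": [3, 1]}
all_elements = {"age": [100]}
integrity_details = {k: 1 - (sum(v) / sum(all_elements[k]))
                     for k, v in invalid_elements.items()}
# integrity_details == {"age": 0.96}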
    def test_pandas_column_map_decorator_partial_exception_counts(self):
        df = PandasDataset({'a': [0, 1, 2, 3, 4]})
        out = df.expect_column_values_to_be_between(
            'a',
            3,
            4,
            result_format={
                'result_format': 'COMPLETE',
                'partial_unexpected_count': 1
            })

        self.assertEqual(1, len(out['result']['partial_unexpected_counts']))
        self.assertEqual(3, len(out['result']['unexpected_list']))
def test_pandas_column_map_decorator_partial_exception_counts():
    df = PandasDataset({"a": [0, 1, 2, 3, 4]})
    out = df.expect_column_values_to_be_between(
        "a",
        3,
        4,
        result_format={
            "result_format": "COMPLETE",
            "partial_unexpected_count": 1
        },
    )

    assert 1 == len(out.result["partial_unexpected_counts"])
    assert 3 == len(out.result["unexpected_list"])
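For comparison, result_format can also be passed as a plain string when row-level details are not needed; a minimal sketch against the same data (BOOLEAN_ONLY is a standard Great Expectations result format):

df = PandasDataset({"a": [0, 1, 2, 3, 4]})
out = df.expect_column_values_to_be_between("a", 3, 4, result_format="BOOLEAN_ONLY")
assert out.success is False  # only the success flag is computed
assert not out.result        # row-level details are omitted in BOOLEAN_ONLY mode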
Example 5
def credit_profiler(ds: PandasDataset) -> ExpectationSuite:
    # simple checks on data consistency
    ds.expect_column_values_to_be_between(
        "credit_card_due", min_value=0, mostly=0.99,  # allow some outliers
    )

    ds.expect_column_values_to_be_between(
        "missed_payments_1y",
        min_value=0,
        max_value=5,
        mostly=0.99,  # allow some outliers
    )

    return ds.get_expectation_suite()
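One possible way to use the returned suite is to validate a fresh batch against it; loan_df and new_loan_df below are hypothetical DataFrames with the same columns:

suite = credit_profiler(PandasDataset(loan_df))                          # loan_df is assumed to exist
results = PandasDataset(new_loan_df).validate(expectation_suite=suite)  # new_loan_df is assumed
print(results.success)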
Example 6
def get_dataset(dataset_type, data):
    """For Pandas, data should be either a DataFrame or a dictionary that can be instantiated as a DataFrame
    For SQL, data should have the following shape:
        {
            'table':
                'table': SqlAlchemy Table object
                named_column: [list of values]
        }

    """
    if dataset_type == 'PandasDataset':
        return PandasDataset(data)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database

        engine = create_engine('sqlite://')

        # Add the data to the database as a new table
        df = pd.DataFrame(data)
        df.to_sql(name='test_data', con=engine, index=False)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset('test_data', engine=engine)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
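A short usage sketch of the helper above; the dictionary form is turned into a DataFrame for Pandas and loaded into an in-memory SQLite table for SQL:

data = {"x": [1, 2, 3]}
pandas_ds = get_dataset('PandasDataset', data)
sql_ds = get_dataset('SqlAlchemyDataset', data)
assert pandas_ds.expect_column_to_exist('x').success
assert sql_ds.expect_column_to_exist('x').success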
Example 7
    def validate_expectations(cls, df_ge: PandasDataset,
                              specs: SchemaParserResult) -> defaultdict[list]:
        """
        Validates the dynamic expectations from the schema via the
        great expectations library.
        """
        invalid_elements = defaultdict(list)
        suite = ExpectationSuite(
            expectation_suite_name="custom_specifications")
        for column in specs.expectation_definitions.keys():
            for expectation in specs.expectation_definitions[column]:
                kwargs_extended = dict(expectation['kwargs'])
                kwargs_extended['column'] = column
                suite.append_expectation(
                    ExpectationConfiguration(
                        expectation_type=expectation['expectation_type'],
                        kwargs=kwargs_extended))
        # noinspection PyTypeChecker
        result = df_ge.validate(expectation_suite=suite, result_format="BASIC")
        for expectation_result in result.results:
            if expectation_result.exception_info['raised_exception']:
                continue
            column_name = expectation_result.expectation_config.kwargs[
                "column"]
            n_invalid = expectation_result.result['unexpected_count']
            invalid_elements[column_name].append(n_invalid)

        return invalid_elements
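The expectation_definitions mapping consumed above is not shown in this excerpt; an assumed illustrative shape, where the column key is added to each kwargs dict before validation:

# Assumed shape (illustrative only): column name -> list of expectation specs
expectation_definitions = {
    "age": [
        {"expectation_type": "expect_column_values_to_be_between",
         "kwargs": {"min_value": 0, "max_value": 120}},
    ],
}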
Example 8
def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
    dataset_copy = dataset.copy(deep=True)

    for column in dataset.columns:
        if dataset.expect_column_values_to_be_in_type_list(
            column, type_list=sorted(list(ProfilerTypeMapping.DATETIME_TYPE_NAMES))
        ).success:
            # GE cannot parse Timestamp or other pandas datetime types
            dataset_copy[column] = dataset[column].dt.strftime("%Y-%m-%dT%H:%M:%S")

        if dataset[column].dtype == np.float32:
            # GE converts expectation arguments into native Python floats.
            # This could cause errors on comparison, so it is better to convert to double beforehand.
            dataset_copy[column] = dataset[column].astype(np.float64)

    return dataset_copy
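A minimal usage sketch, assuming a DataFrame with one datetime column and one float32 column:

df = pd.DataFrame({
    "ts": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "x": np.array([1.5, 2.5], dtype=np.float32),
})
prepared = _prepare_dataset(PandasDataset(df))
# "ts" becomes ISO-formatted strings; "x" is upcast to float64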
Example 9
def nulls_dataset():
    df = pd.DataFrame({
        "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
        "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
    })
    batch_df = PandasDataset(df)

    return batch_df
Example 10
    def udf(df: pd.DataFrame) -> pd.Series:
        from datadog.dogstatsd import DogStatsd

        reporter = (DogStatsd(
            host=os.environ["STATSD_HOST"],
            port=int(os.environ["STATSD_PORT"]),
            telemetry_min_flush_interval=0,
        ) if os.getenv("STATSD_HOST") and os.getenv("STATSD_PORT") else
                    DogStatsd())

        ds = PandasDataset.from_dataset(df)
        result = ds.validate(expectations, result_format="COMPLETE")
        valid_rows = pd.Series([True] * df.shape[0])

        for check in result.results:
            if check.exception_info["raised_exception"]:
                # ToDo: probably we should mark all rows as invalid
                continue

            check_kwargs = check.expectation_config.kwargs
            check_kwargs.pop("result_format", None)
            check_name = "_".join([check.expectation_config.expectation_type] +
                                  [
                                      str(v) for v in check_kwargs.values()
                                      if isinstance(v, (str, int, float))
                                  ])

            if ("unexpected_count" in check.result
                    and check.result["unexpected_count"] > 0):
                reporter.increment(
                    "feast_feature_validation_check_failed",
                    value=check.result["unexpected_count"],
                    tags=[
                        f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
                        f"project:{os.getenv('FEAST_INGESTION_PROJECT_NAME', 'default')}",
                        f"check:{check_name}",
                    ],
                )

                valid_rows.iloc[check.result["unexpected_index_list"]] = False

            elif "observed_value" in check.result and check.result[
                    "observed_value"]:
                reporter.gauge(
                    "feast_feature_validation_observed_value",
                    value=int(check.result["observed_value"] *
                              100  # storing as decimal with precision 2
                              ) if not check.success else
                    0,  # nullify everything below threshold
                    tags=[
                        f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
                        f"project:{os.getenv('FEAST_INGESTION_PROJECT_NAME', 'default')}",
                        f"check:{check_name}",
                    ],
                )

        return valid_rows
Example 11
def test_dataset_from_pandas_source(tmpdir):
    data_file = tmpdir + '/data.json'
    json_data = [
        {"name": "my name", "birthdate": "2020-10-01", "address": "1234 Main st", "size": 12},
        {"name": "your name", "birthdate": "2020-06-01", "address": "1313 Mockingbird Ln",
         "size": 12}
    ]
    with open(data_file, mode='w') as out:
        json.dump(json_data, out)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    pd_dataset = PandasDataset(pandas.read_json(data_file),
                               **{'batch_kwargs': {'path': 'gcs://my_bucket/path/to/my/data'},
                                  'data_context': ctx})
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')

    datasets = action._fetch_datasets_from_pandas_source(pd_dataset,
                                                         validation_result_suite=result_suite)
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == '/path/to/my/data'
    assert input_ds.namespace == "gcs://my_bucket"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "gcs://my_bucket"
    assert input_ds.facets["dataSource"].uri == 'gcs://my_bucket'

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'object'),
                         SchemaField('birthdate', 'object'),
                         SchemaField('address', 'object'),
                         SchemaField('size', 'int64')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])
Example 12
def create_suite():
    df = pd.DataFrame()
    df['num'] = np.random.randint(0, 10, 100)
    df['num2'] = np.random.randint(0, 20, 100)
    ds = PandasDataset.from_dataset(df)

    ds.expect_column_values_to_be_between('num', 0, 10)
    ds.expect_column_values_to_be_between('num2', 0, 20)

    return ds.get_expectation_suite()
Example 13
def profiler_with_unrealistic_expectations(
        dataset: PandasDataset) -> ExpectationSuite:
    # need to create dataframe with corrupted data first
    df = pd.DataFrame()
    df["current_balance"] = [-100]
    df["avg_passenger_count"] = [0]

    other_ds = PandasDataset(df)
    other_ds.expect_column_max_to_be_between("current_balance", -1000, -100)
    other_ds.expect_column_values_to_be_in_set("avg_passenger_count",
                                               value_set={0})

    # this should pass
    other_ds.expect_column_min_to_be_between("avg_passenger_count", 0, 1000)

    return other_ds.get_expectation_suite()
Example 14
    def validate(df) -> pd.DataFrame:
        ds = PandasDataset.from_dataset(df)
        # print(ds, ds.shape)
        result = ds.validate(suite, result_format='COMPLETE')
        valid_rows = pd.Series([True] * ds.shape[0])
        # print(result)
        for check in result.results:
            if check.success:
                continue

            valid_rows.iloc[check.result['unexpected_index_list']] = False
        return valid_rows
Example 15
def cardinality_dataset():
    df = pd.DataFrame({
        "col_none": [None for i in range(0, 1000)],
        "col_one": [0 for i in range(0, 1000)],
        "col_two": [i % 2 for i in range(0, 1000)],
        "col_very_few": [i % 10 for i in range(0, 1000)],
        "col_few": [i % 50 for i in range(0, 1000)],
        "col_many": [i % 100 for i in range(0, 1000)],
        "col_very_many": [i % 500 for i in range(0, 1000)],
        "col_unique": [i for i in range(0, 1000)],
    })
    batch_df = PandasDataset(df)

    return batch_df
Example 16
    def analyze_dataset(self, df: pd.DataFrame) -> Profile:
        """
        Generate a GEProfile with an ExpectationSuite (a set of expectations)
        from a given pandas dataframe by applying the user-defined profiler.

        Some fixes are also applied to the dataset (see the _prepare_dataset function) to make it compatible with GE.

        Returns a GEProfile.
        """
        dataset = PandasDataset(df)

        dataset = _prepare_dataset(dataset)

        return GEProfile(expectation_suite=self.user_defined_profiler(dataset))
Example 17
    def udf(df: pd.DataFrame) -> pd.Series:
        ds = PandasDataset.from_dataset(df)
        result = ds.validate(expectations, result_format="COMPLETE")
        valid_rows = pd.Series([True] * df.shape[0])

        for check in result.results:
            if check.success:
                continue

            if check.exception_info["raised_exception"]:
                # ToDo: probably we should mark all rows as invalid
                continue

            valid_rows.iloc[check.result["unexpected_index_list"]] = False

        return valid_rows
Example 18
    def validate(self, df: pd.DataFrame) -> "GEValidationReport":
        """
        Validate the provided dataframe against the GE expectation suite.
        1. The pandas dataframe is converted into a PandasDataset (a GE type)
        2. Some fixes are applied to the data to avoid crashes inside GE (see _prepare_dataset)
        3. Each expectation from the ExpectationSuite instance is tested against the resulting dataset

        Returns a GEValidationReport, which parses Great Expectations' schema into a list of generic ValidationErrors.
        """
        dataset = PandasDataset(df)

        dataset = _prepare_dataset(dataset)

        results = ge.validate(
            dataset, expectation_suite=self.expectation_suite, result_format="COMPLETE"
        )
        return GEValidationReport(results)
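A hedged end-to-end sketch of how this validate method pairs with analyze_dataset from the earlier example; the owning profiler/profile objects and their construction are assumptions, not shown in these excerpts:

profiler = GEProfiler(user_defined_profiler=credit_profiler)  # assumed constructor
profile = profiler.analyze_dataset(train_df)                  # train_df is assumed; returns a GEProfile
report = profile.validate(new_df)                             # new_df is assumed; returns a GEValidationReport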
Example 19
    def compare_attributes_with_schema(
            self, samples: list,
            schema_definition: SchemaDefinition) -> (dict, dict):
        """
        Calculates attribute integrity and specification of the samples.
        """
        specs = self.parse_schema(schema_definition)

        df_normalized = json_normalize(samples, sep="/")
        df_ge = PandasDataset(df_normalized)

        # calculate integrity
        integrity_details = self.calculate_integrity(df_ge, specs)

        # calculate specification
        specification_details = self.calculate_specification(df_ge, specs)

        return integrity_details, specification_details
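For context, json_normalize with sep="/" flattens nested sample dictionaries into slash-separated column names before they are wrapped in a PandasDataset; a small illustration with a hypothetical sample:

samples = [{"user": {"name": "a", "age": 3}}]
json_normalize(samples, sep="/").columns.tolist()
# -> ['user/name', 'user/age']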
def test_config_with_not_null_only(possible_expectations_set):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """

    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    df = pd.DataFrame(
        {
            "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
            "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
        }
    )
    batch_df = PandasDataset(df)

    profiler_without_not_null_only = UserConfigurableProfiler(
        batch_df, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        batch_df, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(batch_df)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations
Example 21
def test_pandas_datetime_evaluation_parameter():
    evaluation_parameters = {
        "now": pd.Timestamp.now(),
        "now_minus_48h": pd.Timestamp.now() - pd.to_timedelta(2, unit="d"),
    }

    test_data = {
        "data_refresh": [
            pd.Timestamp.now(),
            (pd.Timestamp.now() - pd.to_timedelta(1, unit="d")),
        ]
    }
    _df = pd.DataFrame(test_data)
    df = PandasDataset(_df)

    for param in evaluation_parameters:
        df.set_evaluation_parameter(param, evaluation_parameters[param])
    df.expect_column_max_to_be_between(
        column="data_refresh", min_value={"$PARAMETER": "now_minus_48h"})

    result = df.validate()

    assert result.success
Example 22
def get_dataset(dataset_type,
                data,
                schemas=None,
                profiler=ColumnsExistProfiler,
                caching=True):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == 'PandasDataset':
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, changes in pandas internals about how datetimes are created mean that, to support
                # pandas pre-0.25, we need to explicitly specify when we want a timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        from sqlalchemy import create_engine
        engine = create_engine('sqlite://')
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(
                engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == 'postgresql':
        from sqlalchemy import create_engine
        # Create a new database
        engine = create_engine('postgresql://postgres@localhost/test_ci')
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(
                engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == 'mysql':
        engine = create_engine('mysql://root@localhost/test_ci')
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(
                engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == 'SparkDFDataset':
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example 23
def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING)
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_test:num",
            "set": "validation_test:set"
        }),
    )
Example 24
def get_dataset(dataset_type,
                data,
                schemas=None,
                autoinspect_func=autoinspect.columns_exist,
                caching=False):
    """For Pandas, data should be either a DataFrame or a dictionary that can
    be instantiated as a DataFrame.
    For SQL, data should have the following shape:
        {
            'table':
                'table': SqlAlchemy Table object
                named_column: [list of values]
        }

    """
    if dataset_type == 'PandasDataset':
        df = pd.DataFrame(data)
        if schemas and "pandas" in schemas:
            pandas_schema = {
                key: np.dtype(value)
                for (key, value) in schemas["pandas"].items()
            }
            df = df.astype(pandas_schema)
        return PandasDataset(df,
                             autoinspect_func=autoinspect_func,
                             caching=caching)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database

        # Try to use a local postgres instance (e.g. on Travis); this will allow more testing than sqlite
        try:
            engine = create_engine('postgresql://*****:*****@localhost/test_ci')
            conn = engine.connect()
        except SQLAlchemyError:
            warnings.warn("Falling back to sqlite database.")
            engine = create_engine('sqlite://')
            conn = engine.connect()

        # Add the data to the database as a new table
        df = pd.DataFrame(data)

        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(
                engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type_ == "datetime":
                    df[col] = pd.to_datetime(df[col])
        elif schemas and "postgresql" in schemas and isinstance(
                engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type_ == "timestamp":
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 autoinspect_func=autoinspect_func,
                                 caching=caching)

    elif dataset_type == 'SparkDFDataset':
        spark = SparkSession.builder.getOrCreate()
        data_reshaped = list(zip(*[v for _, v in data.items()]))
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]]())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example 25
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, changes in pandas internals about how datetimes are created mean that, to support
                # pandas pre-0.25, we need to explicitly specify when we want a timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        if not create_engine:
            return None

        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "postgresql":
        if not create_engine:
            return None

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mysql":
        if not create_engine:
            return None

        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mssql":
        if not create_engine:
            return None

        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )

        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))

        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and dataset_type in schemas
                and isinstance(engine.dialect, mssqltypes.dialect)):
            schema = schemas[dataset_type]
            sql_dtypes = {
                col: MSSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example 26
def titanic_dataset():
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    batch_df = PandasDataset(df)

    return batch_df
Example 27
def test_that_ge_pandas_datasets_are_memory_efficient(csvpath: Path):
    df = pd.read_csv(str(csvpath))
    df_ge = PandasDataset(df)
    bequals = df.values.base == df_ge.values.base
    assert bool(bequals.all())
Example 28
def test_validation_with_ge(feast_client: Client, kafka_server):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_ge:num",
            "set": "validation_ge:set"
        }),
    )
Example 29
def test_validation_reports_metrics(feast_client: Client, kafka_server,
                                    statsd_server: StatsDServer):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge_metrics")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100":
        validation_result.results[0].result["unexpected_count"],
        "expect_column_values_to_be_in_set_set":
        validation_result.results[1].result["unexpected_count"],
    }
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=[
                "validation_ge_metrics:num", "validation_ge_metrics:set"
            ],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    expected_metrics = [(
        f"feast_feature_validation_check_failed#check:{check_name},"
        f"feature_table:{feature_table.name},project:{feast_client.project}",
        value,
    ) for check_name, value in unexpected_counts.items()]
    wait_retry_backoff(
        lambda: (
            None,
            all(
                statsd_server.metrics.get(m) == v
                for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: " +
        str(expected_metrics) + "\n"
        "Actual received metrics" + str(statsd_server.metrics),
    )