Example #1
def test_historical_features_with_missing_request_data(environment,
                                                       universal_data_sources,
                                                       full_feature_names):
    store = environment.feature_store

    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    # If request data needed for an on demand transform is missing, an error should be raised
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=datasets.entity_df,
            features=[
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
                "field_mapping:feature_name",
            ],
            full_feature_names=full_feature_names,
        )
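
For contrast, a hedged sketch of the passing counterpart: the on demand transform's request data must be present as a column in the entity dataframe. The "val_to_add" column name is an assumption here, taken from the request data used with conv_rate_plus_100 in the retrieval examples further below.

    # Sketch: supply the request data column so the same retrieval succeeds
    # (assumes the request source expects "val_to_add").
    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = list(
        range(len(entity_df_with_request_data)))
    store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=["conv_rate_plus_100:conv_rate_plus_val_to_add"],
        full_feature_names=full_feature_names,
    )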
Example #2
def test_push_features_and_read(environment, universal_data_sources):
    store = environment.feature_store

    (_, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])
    data = {
        "location_id": [1],
        "temperature": [4],
        "event_timestamp":
        [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
        "created": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
    }
    df_ingest = pd.DataFrame(data)

    store.push("location_stats_push_source", df_ingest)

    online_resp = store.get_online_features(
        features=["pushable_location_stats:temperature"],
        entity_rows=[{
            "location_id": 1
        }],
    )
    online_resp_dict = online_resp.to_dict()
    assert online_resp_dict["location_id"] == [1]
    assert online_resp_dict["temperature"] == [4]
Example #3
def test_feature_get_online_features_types_match(online_types_test_fixtures):
    environment, config, data_source, fv = online_types_test_fixtures
    fv = create_feature_view(config.feature_dtype, config.feature_is_list,
                             data_source)
    fs = environment.feature_store
    features = [fv.name + ":value"]
    entity = driver(value_type=ValueType.UNKNOWN)
    fs.apply([fv, entity])
    fs.materialize(environment.start_date, environment.end_date)

    driver_id_value = "1" if config.entity_type == ValueType.STRING else 1
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{
            "driver": driver_id_value
        }],
    ).to_dict()

    feature_list_dtype_to_expected_online_response_value_type = {
        "int32": "int",
        "int64": "int",
        "float": "float",
        "string": "str",
        "bool": "bool",
    }
    if config.feature_is_list:
        assert type(online_features["value"][0]).__name__ == "list"
        assert (type(online_features["value"][0][0]).__name__ ==
                feature_list_dtype_to_expected_online_response_value_type[
                    config.feature_dtype])
    else:
        assert (type(online_features["value"][0]).__name__ ==
                feature_list_dtype_to_expected_online_response_value_type[
                    config.feature_dtype])
Example #4
def test_feature_get_online_features_types_match(online_types_test_fixtures):
    environment, config, data_source, fv = online_types_test_fixtures
    fv = create_feature_view(config.feature_dtype, config.feature_is_list,
                             config.has_empty_list, data_source)
    fs = environment.feature_store
    features = [fv.name + ":value"]
    entity = driver(value_type=ValueType.UNKNOWN)
    fs.apply([fv, entity])
    fs.materialize(environment.start_date, environment.end_date)

    driver_id_value = "1" if config.entity_type == ValueType.STRING else 1
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{
            "driver": driver_id_value
        }],
    ).to_dict()

    feature_list_dtype_to_expected_online_response_value_type = {
        "int32": int,
        "int64": int,
        "float": float,
        "string": str,
        "bool": bool,
    }
    expected_dtype = feature_list_dtype_to_expected_online_response_value_type[
        config.feature_dtype]
    if config.feature_is_list:
        for feature in online_features["value"]:
            assert isinstance(feature, list)
            for element in feature:
                assert isinstance(element, expected_dtype)
    else:
        for feature in online_features["value"]:
            assert isinstance(feature, expected_dtype)
Example #5
def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources):
    config = IntegrationTestRepoConfig(
        offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator"
    )
    import os

    if "AWS_ACCESS_KEY_ID" in os.environ:
        raise Exception(
            "AWS_ACCESS_KEY_ID has already been set in the environment. Setting it again may cause a conflict. "
            "It may be better to deduplicate AWS configuration or use sub-processes for isolation"
        )

    os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

    with construct_test_environment(config) as environment:
        fs = environment.feature_store

        entities, datasets, data_sources = universal_data_sources
        feature_views = construct_universal_feature_views(data_sources)

        feast_objects = []
        feast_objects.extend(feature_views.values())
        feast_objects.extend([driver(), customer()])
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)

        out = fs.get_online_features(
            features=["driver_stats:conv_rate"], entity_rows=[{"driver": 5001}]
        ).to_dict()
        assert out["conv_rate"][0] is not None

    del os.environ["AWS_ACCESS_KEY_ID"]
    del os.environ["AWS_SECRET_ACCESS_KEY"]
Example #6
def test_historical_retrieval_with_validation(environment,
                                              universal_data_sources):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)
    store.apply([driver(), customer(), location(), *feature_views.values()])

    # Create two identical retrieval jobs
    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"])
    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )
    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    # Save dataset using reference job and retrieve it
    store.create_saved_dataset(
        from_=reference_job,
        name="my_training_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )
    saved_dataset = store.get_saved_dataset("my_training_dataset")

    # If validation passes, no exception is raised at this point
    reference = saved_dataset.as_reference(profiler=configurable_profiler)
    job.to_df(validation_reference=reference)
Example #7
def test_online_retrieval_with_event_timestamps(environment,
                                                universal_data_sources,
                                                full_feature_names):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    fs.apply([driver(), feature_views.driver, feature_views.global_fv])

    # fake data to ingest into Online Store
    data = {
        "driver_id": [1, 2],
        "conv_rate": [0.5, 0.3],
        "acc_rate": [0.6, 0.4],
        "avg_daily_trips": [4, 5],
        "event_timestamp": [
            pd.to_datetime(1646263500, utc=True, unit="s"),
            pd.to_datetime(1646263600, utc=True, unit="s"),
        ],
        "created": [
            pd.to_datetime(1646263500, unit="s"),
            pd.to_datetime(1646263600, unit="s"),
        ],
    }
    df_ingest = pd.DataFrame(data)

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_ingest)

    response = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{
            "driver_id": 1
        }, {
            "driver_id": 2
        }],
    )
    df = response.to_df(True)
    assertpy.assert_that(len(df)).is_equal_to(2)
    assertpy.assert_that(df["driver_id"].iloc[0]).is_equal_to(1)
    assertpy.assert_that(df["driver_id"].iloc[1]).is_equal_to(2)
    assertpy.assert_that(df["avg_daily_trips" +
                            TIMESTAMP_POSTFIX].iloc[0]).is_equal_to(1646263500)
    assertpy.assert_that(df["avg_daily_trips" +
                            TIMESTAMP_POSTFIX].iloc[1]).is_equal_to(1646263600)
    assertpy.assert_that(df["acc_rate" +
                            TIMESTAMP_POSTFIX].iloc[0]).is_equal_to(1646263500)
    assertpy.assert_that(df["acc_rate" +
                            TIMESTAMP_POSTFIX].iloc[1]).is_equal_to(1646263600)
    assertpy.assert_that(df["conv_rate" +
                            TIMESTAMP_POSTFIX].iloc[0]).is_equal_to(1646263500)
    assertpy.assert_that(df["conv_rate" +
                            TIMESTAMP_POSTFIX].iloc[1]).is_equal_to(1646263600)
Example #8
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    fs = environment.feature_store
    df, data_source = e2e_data_sources
    fv = driver_feature_view(data_source=data_source, infer_features=infer_features)

    entity = driver()
    fs.apply([fv, entity])

    run_offline_online_store_consistency_test(fs, fv)
Example #9
def test_online_retrieval(environment, universal_data_sources, benchmark):

    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"]
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), location(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(environment.start_date, environment.end_date)

    sample_drivers = random.sample(entities["driver"], 10)

    sample_customers = random.sample(entities["customer"], 10)

    entity_rows = [{
        "driver": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    benchmark(
        fs.get_online_features,
        features=feature_refs,
        entity_rows=entity_rows,
    )
Example #10
def test_feature_get_historical_features_types_match(
        offline_types_test_fixtures):
    """
    Note: for this test to work, get_historical_features must return at least one
    non-null row so that type inference works. This can only be achieved by
    carefully matching entity_df to the data fixtures.
    """
    environment, config, data_source, fv = offline_types_test_fixtures
    fs = environment.feature_store
    entity = driver()
    fv = create_feature_view(
        "get_historical_features_types_match",
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    fs.apply([fv, entity])

    entity_df = pd.DataFrame()
    entity_df["driver_id"] = (["1", "3"] if config.entity_type
                              == ValueType.STRING else [1, 3])
    ts = pd.Timestamp(datetime.utcnow()).round("ms")
    entity_df["ts"] = [
        ts - timedelta(hours=4),
        ts - timedelta(hours=2),
    ]
    features = [f"{fv.name}:value"]

    historical_features = fs.get_historical_features(
        entity_df=entity_df,
        features=features,
    )
    # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs
    historical_features_df = historical_features.to_df()
    print(historical_features_df)

    if config.feature_is_list:
        assert_feature_list_types(
            environment.test_repo_config.provider,
            config.feature_dtype,
            historical_features_df,
        )
    else:
        assert_expected_historical_feature_types(config.feature_dtype,
                                                 historical_features_df)
    assert_expected_arrow_types(
        environment.test_repo_config.provider,
        config.feature_dtype,
        config.feature_is_list,
        historical_features,
    )
Example #11
def test_feature_get_online_features_types_match(online_types_test_fixtures):
    environment, config, data_source, fv = online_types_test_fixtures
    fv = create_feature_view(
        "get_online_features_types_match",
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    fs = environment.feature_store
    features = [fv.name + ":value"]
    entity = driver(value_type=config.entity_type)
    fs.apply([fv, entity])
    fs.materialize(
        environment.start_date,
        # Throw out the last record so that we can successfully infer types
        # even from all-empty values.
        environment.end_date - timedelta(hours=1),
    )

    driver_id_value = "1" if config.entity_type == ValueType.STRING else 1
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{
            "driver_id": driver_id_value
        }],
    ).to_dict()

    feature_list_dtype_to_expected_online_response_value_type = {
        "int32": int,
        "int64": int,
        "float": float,
        "string": str,
        "bool": bool,
        "datetime": datetime,
    }
    expected_dtype = feature_list_dtype_to_expected_online_response_value_type[
        config.feature_dtype]

    assert len(online_features["value"]) == 1

    if config.feature_is_list:
        for feature in online_features["value"]:
            assert isinstance(feature, list), "Feature value should be a list"
            assert (config.has_empty_list
                    or len(feature) > 0), "List of values should not be empty"
            for element in feature:
                assert isinstance(element, expected_dtype)
    else:
        for feature in online_features["value"]:
            assert isinstance(feature, expected_dtype)
Example #12
def test_feature_get_historical_features_types_match(
        offline_types_test_fixtures):
    environment, config, data_source, fv = offline_types_test_fixtures
    fs = environment.feature_store
    fv = create_feature_view(
        "get_historical_features_types_match",
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )
    entity = driver()
    fs.apply([fv, entity])

    features = [f"{fv.name}:value"]
    entity_df = pd.DataFrame()
    entity_df["driver_id"] = (["1", "3"] if config.entity_type
                              == ValueType.STRING else [1, 3])
    now = datetime.utcnow()
    ts = pd.Timestamp(now).round("ms")
    entity_df["ts"] = [
        ts - timedelta(hours=4),
        ts - timedelta(hours=2),
    ]
    historical_features = fs.get_historical_features(
        entity_df=entity_df,
        features=features,
    )
    # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs
    historical_features_df = historical_features.to_df()
    print(historical_features_df)

    if config.feature_is_list:
        assert_feature_list_types(
            environment.test_repo_config.provider,
            config.feature_dtype,
            historical_features_df,
        )
    else:
        assert_expected_historical_feature_types(config.feature_dtype,
                                                 historical_features_df)
    assert_expected_arrow_types(
        environment.test_repo_config.provider,
        config.feature_dtype,
        config.feature_is_list,
        historical_features,
    )
Example #13
def test_e2e_consistency(environment, e2e_data_sources, infer_features):
    fs = environment.feature_store
    df, data_source = e2e_data_sources
    fv = driver_feature_view(
        name=f"test_consistency_{'with_inference' if infer_features else ''}",
        data_source=data_source,
        infer_features=infer_features,
    )

    entity = driver()
    fs.apply([fv, entity])

    # Materialization is run in two steps, using a timestamp from the
    # generated dataframe as the split point.
    split_dt = df["ts_1"][4].to_pydatetime() - timedelta(seconds=1)

    run_offline_online_store_consistency_test(fs, fv, split_dt)
Example #14
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    features = [Field(name="conv_rate_plus_200", dtype=Float64)]
    driver_hourly_stats = create_driver_hourly_stats_batch_feature_view(
        data_sources.driver)
    request_source = create_conv_rate_request_source()
    driver_odfv = conv_rate_plus_100_feature_view(
        [driver_hourly_stats, request_source],
        features=features,
    )

    feast_objects = [driver_hourly_stats, driver_odfv, driver(), customer()]
    with pytest.raises(SpecifiedFeaturesNotPresentError):
        store.apply(feast_objects)
Example #15
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    features = [Feature("conv_rate_plus_200", ValueType.DOUBLE)]
    driver_hourly_stats = create_driver_hourly_stats_feature_view(
        data_sources["driver"]
    )
    request_data_source = create_conv_rate_request_data_source()
    driver_odfv = conv_rate_plus_100_feature_view(
        {"driver": driver_hourly_stats, "input_request": request_data_source},
        features=features,
    )

    feast_objects = [driver_hourly_stats, driver_odfv, driver(), customer()]
    with pytest.raises(SpecifiedFeaturesNotPresentError):
        store.apply(feast_objects)
Example #16
def test_infer_odfv_features(environment, universal_data_sources, infer_features):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    driver_hourly_stats = create_driver_hourly_stats_feature_view(
        data_sources["driver"]
    )
    request_data_source = create_conv_rate_request_data_source()
    driver_odfv = conv_rate_plus_100_feature_view(
        {"driver": driver_hourly_stats, "input_request": request_data_source},
        infer_features=infer_features,
    )

    feast_objects = [driver_hourly_stats, driver_odfv, driver(), customer()]
    store.apply(feast_objects)
    odfv = store.get_on_demand_feature_view("conv_rate_plus_100")
    assert len(odfv.features) == 2
Example #17
def setup_python_fs_client():
    config = IntegrationTestRepoConfig()
    environment = construct_test_environment(config)
    fs = environment.feature_store
    try:
        entities, datasets, data_sources = construct_universal_test_data(
            environment)
        feature_views = construct_universal_feature_views(data_sources)
        feast_objects: List[FeastObject] = []
        feast_objects.extend(feature_views.values())
        feast_objects.extend([driver(), customer(), location()])
        fs.apply(feast_objects)
        fs.materialize(environment.start_date, environment.end_date)
        client = TestClient(get_app(fs))
        yield client
    finally:
        fs.teardown()
        environment.data_source_creator.teardown()
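
Since setup_python_fs_client is a generator, it is presumably consumed as a pytest fixture so that the teardown in its finally block runs after the tests; a minimal sketch of that wiring (the fixture name is assumed):

    @pytest.fixture(scope="module")
    def python_fs_client():
        # Delegate to the generator above: setup runs before the tests,
        # teardown runs afterwards.
        yield from setup_python_fs_client()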
Example #18
def test_historical_retrieval_fails_on_validation(environment,
                                                  universal_data_sources):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"])

    reference_job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    store.create_saved_dataset(
        from_=reference_job,
        name="my_other_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
    )

    job = store.get_historical_features(
        entity_df=entity_df,
        features=_features,
    )

    with pytest.raises(ValidationFailed) as exc_info:
        job.to_df(validation_reference=store.get_saved_dataset(
            "my_other_dataset").as_reference(
                profiler=profiler_with_unrealistic_expectations))

    failed_expectations = exc_info.value.report.errors
    assert len(failed_expectations) == 2

    assert failed_expectations[0].check_name == "expect_column_max_to_be_between"
    assert failed_expectations[0].column_name == "current_balance"

    assert failed_expectations[1].check_name == "expect_column_values_to_be_in_set"
    assert failed_expectations[1].column_name == "avg_passenger_count"
Example #19
def test_entity_inference_types_match(offline_types_test_fixtures):
    environment, config, data_source, fv = offline_types_test_fixtures
    fs = environment.feature_store

    # Don't specify value type in entity to force inference
    entity = driver(value_type=ValueType.UNKNOWN)
    fs.apply([fv, entity])

    entities = fs.list_entities()
    entity_type_to_expected_inferred_entity_type = {
        ValueType.INT32: ValueType.INT64,
        ValueType.INT64: ValueType.INT64,
        ValueType.FLOAT: ValueType.DOUBLE,
        ValueType.STRING: ValueType.STRING,
    }
    for entity in entities:
        assert (
            entity.value_type == entity_type_to_expected_inferred_entity_type[
                config.entity_type])
Example #20
def test_write_to_online_store(environment, universal_data_sources):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_hourly_stats = create_driver_hourly_stats_feature_view(
        data_sources.driver)
    driver_entity = driver()

    # Register Feature View and Entity
    fs.apply([driver_hourly_stats, driver_entity])

    # fake data to ingest into Online Store
    data = {
        "driver_id": [123],
        "conv_rate": [0.85],
        "acc_rate": [0.91],
        "avg_daily_trips": [14],
        "event_timestamp":
        [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
        "created": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
    }
    df_data = pd.DataFrame(data)

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_data)

    # assert the right data is in the Online Store
    df = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{
            "driver_id": 123
        }],
    ).to_df()
    assertpy.assert_that(df["avg_daily_trips"].iloc[0]).is_equal_to(14)
    assertpy.assert_that(df["acc_rate"].iloc[0]).is_close_to(0.91, 1e-6)
    assertpy.assert_that(df["conv_rate"].iloc[0]).is_close_to(0.85, 1e-6)
Example #21
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    This method takes the parameters from the test repo config, creates a feature repo, applies it,
    and returns the constructed feature store object to callers.

    This feature store object can be interacted with for testing purposes.
    The user is *not* expected to perform any clean up actions.

    :param test_repo_config: configuration
    :return: A feature store built using the supplied configuration.
    """
    df = create_dataset()

    project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"

    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1)

    offline_creator: DataSourceCreator = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator")(project)
    ds = offline_creator.create_data_source(project,
                                            df,
                                            field_mapping={
                                                "ts_1": "ts",
                                                "id": "driver_id"
                                            })
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        fvs = []
        entities = []
        try:
            if create_and_apply:
                entities.extend([driver(), customer()])
                fvs.extend([
                    environment.driver_stats_feature_view(),
                    environment.customer_feature_view(),
                ])
                fs.apply(fvs + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            offline_creator.teardown()
            fs.teardown()
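
Because the function yields inside a try/finally, callers presumably consume it as a context manager (Example #5 above uses it in a with statement, which implies a contextlib.contextmanager decoration). A usage sketch under that assumption:

    # Sketch: construct an isolated environment, run assertions, and rely on
    # the finally block for offline store and feature store teardown.
    with construct_test_environment(test_repo_config, create_and_apply=True) as environment:
        fs = environment.feature_store
        assert fs.project.startswith("test_correctness_")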
Example #22
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Some online store implementations (like Redis) keep features from different feature views
    that share common entities together.
    This can result in all features attached to an entity being deleted
    when only one feature view was the deletion target (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
        1. Register two feature views with the common entity "driver"
        2. Materialize data
        3. Check that features are available (via online retrieval)
        4. Delete one feature view
        5. Check that features for the other are still available
        6. Delete the other feature view (and create it again)
        7. Verify that features for both feature views were deleted
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_entities = entities.driver_vals
    df = pd.DataFrame({
        "ts_1": [environment.end_date] * len(driver_entities),
        "created_ts": [environment.end_date] * len(driver_entities),
        "driver_id":
        driver_entities,
        "value":
        np.random.random(size=len(driver_entities)),
    })

    ds = environment.data_source_creator.create_data_source(
        df, destination_name="simple_driver_dataset")

    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver")

    fs.apply([driver(), simple_driver_fv, driver_stats_fv])

    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )
    expected_values = df.sort_values(by="driver_id")

    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [{
        "driver_id": driver_id
    } for driver_id in sorted(driver_entities)]

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[simple_driver_fv],
             objects_to_delete=[driver_stats_fv],
             partial=False)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            return None, False

        return None, True

    # The online store backend might be eventually consistent for schema updates,
    # so recreating a table that was just deleted may need some retries.
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert all(v is None for v in online_features["value"])
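
wait_retry_backoff above is a Feast test utility whose contract is visible from eventually_apply: the retried command returns a (result, is_done) tuple, and it is invoked repeatedly until is_done is True or the timeout elapses. A hedged sketch of that contract (not the utility's actual implementation):

    import time

    def wait_retry_backoff_sketch(retry_command, timeout_secs=60):
        # Poll with a growing delay until the command reports completion.
        delay, deadline = 0.1, time.time() + timeout_secs
        while time.time() < deadline:
            result, is_done = retry_command()
            if is_done:
                return result
            time.sleep(delay)
            delay = min(delay * 2, 2.0)
        raise TimeoutError("command did not complete within the timeout")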
Example #23
def test_online_retrieval(environment, universal_data_sources,
                          full_feature_names):
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]],
            feature_views.driver_odfv,
            feature_views.customer[["current_balance"]],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets.orders_df.sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets.orders_df[(
        datasets.orders_df["customer_id"].isin(entity_sample["customer_id"])
        & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]))]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets.driver_df[datasets.driver_df["driver_id"].isin(
        sample_drivers)]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets.customer_df[
        datasets.customer_df["customer_id"].isin(sample_customers)]

    location_pairs = np.array(
        list(itertools.permutations(entities.location_vals, 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T.tolist()
    origins_df = datasets.location_df[datasets.location_df["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets.location_df[
        datasets.location_df["location_id"].isin(sample_location_pairs[1])]

    global_df = datasets.global_df

    entity_rows = [{
        "driver_id": d,
        "customer_id": c,
        "val_to_add": 50
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    # Test that the on demand feature views compute properly even if the dependent conv_rate
    # feature isn't requested.
    online_features_no_conv_rate = get_online_features_dict(
        environment=environment,
        features=[
            ref for ref in feature_refs if ref != "driver_stats:conv_rate"
        ],
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )

    assert online_features_no_conv_rate is not None

    keys = set(online_features_dict.keys())
    expected_keys = set(
        f.replace(":", "__") if full_feature_names else f.split(":")[-1]
        for f in feature_refs) | {"customer_id", "driver_id"}
    assert (
        keys == expected_keys
    ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)"

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       feature_refs,
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", feature_refs,
                full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, feature_refs,
                    full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = get_online_features_dict(
        environment=environment,
        features=feature_refs,
        entity_rows=[{
            "driver_id": 0,
            "customer_id": 0,
            "val_to_add": 100
        }],
        full_feature_names=full_feature_names,
    )
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, feature_refs, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        get_online_features_dict(
            environment=environment,
            features=feature_refs,
            entity_rows=[{
                "driver_id": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        )

    assert_feature_service_correctness(
        environment,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

    entity_rows = [{
        "origin_id": origin,
        "destination_id": destination
    } for (_driver, _customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        environment,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        origins_df,
        destinations_df,
    )
Example #24
def test_entity_ttl_online_store(local_redis_environment,
                                 redis_universal_data_sources):
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store
    # set the key TTL in the online store to 1 second
    fs.config.online_store.key_ttl_seconds = 1
    entities, datasets, data_sources = redis_universal_data_sources
    driver_hourly_stats = create_driver_hourly_stats_feature_view(
        data_sources.driver)
    driver_entity = driver()

    # Register Feature View and Entity
    fs.apply([driver_hourly_stats, driver_entity])

    # fake data to ingest into Online Store
    data = {
        "driver_id": [1],
        "conv_rate": [0.5],
        "acc_rate": [0.6],
        "avg_daily_trips": [4],
        "event_timestamp":
        [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
        "created": [pd.Timestamp(datetime.datetime.utcnow()).round("ms")],
    }
    df_ingest = pd.DataFrame(data)

    # directly ingest data into the Online Store
    fs.write_to_online_store("driver_stats", df_ingest)

    # assert the right data is in the Online Store
    df = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{
            "driver_id": 1
        }],
    ).to_df()
    assertpy.assert_that(df["avg_daily_trips"].iloc[0]).is_equal_to(4)
    assertpy.assert_that(df["acc_rate"].iloc[0]).is_close_to(0.6, 1e-6)
    assertpy.assert_that(df["conv_rate"].iloc[0]).is_close_to(0.5, 1e-6)

    # simulate time passing for testing ttl
    time.sleep(1)

    # retrieve the same entity again
    df = fs.get_online_features(
        features=[
            "driver_stats:avg_daily_trips",
            "driver_stats:acc_rate",
            "driver_stats:conv_rate",
        ],
        entity_rows=[{
            "driver_id": 1
        }],
    ).to_df()
    # assert that the entity features expired in the online store
    assertpy.assert_that(df["avg_daily_trips"].iloc[0]).is_none()
    assertpy.assert_that(df["acc_rate"].iloc[0]).is_none()
    assertpy.assert_that(df["conv_rate"].iloc[0]).is_none()
Example #25
def test_online_retrieval(environment, universal_data_sources, full_feature_names):

    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[feature_views["driver"][["conv_rate"]], feature_views["driver_odfv"]],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([driver(), customer(), feature_service])
    fs.apply(feast_objects)
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets["orders"].sample(10)[
        ["customer_id", "driver_id", "order_id", "event_timestamp"]
    ]
    orders_df = datasets["orders"][
        (
            datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
            & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"])
        )
    ]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][
        datasets["driver"]["driver_id"].isin(sample_drivers)
    ]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][
        datasets["customer"]["customer_id"].isin(sample_customers)
    ]

    global_df = datasets["global"]

    entity_rows = [
        {"driver": d, "customer_id": c, "val_to_add": 50}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    assert (
        len(keys) == len(feature_refs) + 3
    )  # Add three for the driver id and the customer id entity keys + val_to_add request data.
    for feature in feature_refs:
        if full_feature_names:
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
            assert (
                "driver_stats" not in keys
                and "customer_profile" not in keys
                and "order" not in keys
                and "global_stats" not in keys
            )

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, orders_df, global_df, entity_row
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        assert (
            online_features_dict[
                response_feature_name("conv_rate_plus_100", full_feature_names)
            ][i]
            == df_features["conv_rate"] + 100
        )
        assert (
            online_features_dict[
                response_feature_name("conv_rate_plus_val_to_add", full_feature_names)
            ][i]
            == df_features["conv_rate"] + df_features["val_to_add"]
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][i],
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": 0, "customer_id": 0, "val_to_add": 100}],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}:
            tc.assertIsNone(
                missing_responses_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][0]
            )

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{"driver": 0, "customer_id": 0}],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )
Example #26
def test_historical_features_persisting(environment, universal_data_sources,
                                        full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    store.apply([driver(), customer(), location(), *feature_views.values()])

    entity_df = datasets.entity_df.drop(
        columns=["order_id", "origin_id", "destination_id"])

    job = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    saved_dataset = store.create_saved_dataset(
        from_=job,
        name="saved_dataset",
        storage=environment.data_source_creator.create_saved_dataset_destination(),
        tags={"env": "test"},
    )

    event_timestamp = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df,
        event_timestamp,
        full_feature_names,
    ).drop(columns=[
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded",
                              full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ])

    assert_frame_equal(
        expected_df,
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )

    assert_frame_equal(
        job.to_df(),
        saved_dataset.to_df(),
        keys=[event_timestamp, "driver_id", "customer_id"],
    )
Example #27
def test_historical_features_with_entities_from_query(environment,
                                                      universal_data_sources,
                                                      full_feature_names):
    store = environment.feature_store
    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    orders_table = table_name_from_data_source(data_sources.orders)
    if not orders_table:
        pytest.skip("Offline source is not SQL-based")

    data_source_creator = environment.test_repo_config.offline_store_creator
    if data_source_creator.__name__ == SnowflakeDataSourceCreator.__name__:
        entity_df_query = f"""
        SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp"
        FROM "{orders_table}"
        """
    else:
        entity_df_query = f"""
        SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp
        FROM {orders_table}
        """

    store.apply([driver(), customer(), location(), *feature_views.values()])

    job_from_sql = store.get_historical_features(
        entity_df=entity_df_query,
        features=[
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_sql_entities = job_from_sql.to_df()
    end_time = datetime.utcnow()
    print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

    event_timestamp = (DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       in datasets.orders_df.columns else "e_ts")
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        datasets.entity_df,
        event_timestamp,
        full_feature_names,
    )

    # The on demand transform isn't requested with an entity_df query (request data can't be added to a query)
    expected_df_query = full_expected_df.drop(columns=[
        response_feature_name("conv_rate_plus_100", full_feature_names),
        response_feature_name("conv_rate_plus_100_rounded",
                              full_feature_names),
        response_feature_name("avg_daily_trips", full_feature_names),
        response_feature_name("conv_rate", full_feature_names),
        "origin__temperature",
        "destination__temperature",
    ])
    assert_frame_equal(
        expected_df_query,
        actual_df_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    table_from_sql_entities = job_from_sql.to_arrow().to_pandas()
    for col in table_from_sql_entities.columns:
        expected_df_query[col] = expected_df_query[col].astype(
            table_from_sql_entities[col].dtype)

    assert_frame_equal(
        expected_df_query,
        table_from_sql_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
Example #28
def test_historical_features(environment, universal_data_sources, full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    customer_df, driver_df, location_df, orders_df, global_df, entity_df = (
        datasets["customer"],
        datasets["driver"],
        datasets["location"],
        datasets["orders"],
        datasets["global"],
        datasets["entity"],
    )
    entity_df_with_request_data = entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    (
        customer_fv,
        driver_fv,
        driver_odfv,
        location_fv,
        order_fv,
        global_fv,
        driver_age_request_fv,
    ) = (
        feature_views["customer"],
        feature_views["driver"],
        feature_views["driver_odfv"],
        feature_views["location"],
        feature_views["order"],
        feature_views["global"],
        feature_views["driver_age_request_fv"],
    )

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            driver_odfv,
            driver_age_request_fv,
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            location_fv.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}
            ),
            location_fv.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}
            ),
        ],
    )

    feast_objects = []
    feast_objects.extend(
        [
            customer_fv,
            driver_fv,
            driver_odfv,
            location_fv,
            order_fv,
            global_fv,
            driver_age_request_fv,
            driver(),
            customer(),
            location(),
            feature_service,
            feature_service_entity_mapping,
        ]
    )
    store.apply(feast_objects)

    entity_df_query = None
    orders_table = table_name_from_data_source(data_sources["orders"])
    if orders_table:
        entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}"

    event_timestamp = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns
        else "e_ts"
    )
    full_expected_df = get_expected_training_df(
        customer_df,
        customer_fv,
        driver_df,
        driver_fv,
        orders_df,
        order_fv,
        location_df,
        location_fv,
        global_df,
        global_fv,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow entities' features are only needed in the FeatureService test
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    if entity_df_query:
        job_from_sql = store.get_historical_features(
            entity_df=entity_df_query,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "order:order_is_success",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )

        start_time = datetime.utcnow()
        actual_df_from_sql_entities = job_from_sql.to_df()
        end_time = datetime.utcnow()
        print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

        # The on demand transform isn't requested with an entity_df query (request data can't be added to a query)
        expected_df_query = expected_df.drop(
            columns=[
                "conv_rate_plus_100",
                "conv_rate_plus_100_rounded",
                "val_to_add",
                "conv_rate_plus_val_to_add",
                "driver_age",
            ]
        )
        assert sorted(expected_df_query.columns) == sorted(
            actual_df_from_sql_entities.columns
        )

        actual_df_from_sql_entities = (
            actual_df_from_sql_entities[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )
        expected_df_query = (
            expected_df_query.sort_values(
                by=[event_timestamp, "order_id", "driver_id", "customer_id"]
            )
            .drop_duplicates()
            .reset_index(drop=True)
        )

        assert_frame_equal(
            actual_df_from_sql_entities, expected_df_query, check_dtype=False,
        )

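        # The same retrieval job can also be materialized as an Arrow table;
        # verify it matches the pandas path after normalizing dtypes below.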
        table_from_sql_entities = job_from_sql.to_arrow()
        df_from_sql_entities = (
            table_from_sql_entities.to_pandas()[expected_df_query.columns]
            .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
            .drop_duplicates()
            .reset_index(drop=True)
        )

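        # Cast the expected columns to the dtypes the offline store produced,
        # so the exact (dtype-checked) comparison below fails only on values.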
        for col in df_from_sql_entities.columns:
            expected_df_query[col] = expected_df_query[col].astype(
                df_from_sql_entities[col].dtype
            )

        assert_frame_equal(expected_df_query, df_from_sql_entities)

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "driver_age:driver_age",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()

    print(f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}")
    end_time = datetime.utcnow()
    print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n"))

    assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns)
    expected_df: pd.DataFrame = (
        expected_df.sort_values(
            by=[event_timestamp, "order_id", "driver_id", "customer_id"]
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )
    actual_df_from_df_entities = (
        actual_df_from_df_entities[expected_df.columns]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        expected_df, actual_df_from_df_entities, check_dtype=False,
    )
    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )

    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    columns_expected_in_table = expected_df.columns.tolist()

    table_from_df_entities = (
        table_from_df_entities[columns_expected_in_table]
        .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    assert_frame_equal(actual_df_from_df_entities, table_from_df_entities)

    # If request data needed for an on demand transform is missing, an error
    # should be raised
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "conv_rate_plus_100:conv_rate_plus_100",
                "conv_rate_plus_100:conv_rate_plus_val_to_add",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
    # If request data needed for a request feature view is missing, an error
    # should be raised
    with pytest.raises(RequestDataNotFoundInEntityDfException):
        store.get_historical_features(
            entity_df=entity_df,
            features=[
                "driver_stats:conv_rate",
                "driver_stats:avg_daily_trips",
                "customer_profile:current_balance",
                "customer_profile:avg_passenger_count",
                "customer_profile:lifetime_trip_count",
                "driver_age:driver_age",
                "global_stats:num_rides",
                "global_stats:avg_ride_length",
            ],
            full_feature_names=full_feature_names,
        )
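
# A minimal standalone sketch (not part of the tests) of the entity-mapping
# API exercised above: with_name() aliases a feature view and
# with_join_key_map() remaps its join key onto a column of the entity df, so
# the same view can be joined twice under different names. Here location_fv
# stands in for a FeatureView with join key "location_id"; its construction
# is omitted since the constructor signature varies across Feast versions.
from feast import FeatureService

location_fv = ...  # hypothetical FeatureView with join key "location_id"

fs_entity_mapping_sketch = FeatureService(
    name="entity_mapping_sketch",
    features=[
        # Join the view's features onto the origin_id column of the entity df
        location_fv.with_name("origin").with_join_key_map(
            {"location_id": "origin_id"}
        ),
        # ... and again onto the destination_id column
        location_fv.with_name("destination").with_join_key_map(
            {"location_id": "destination_id"}
        ),
    ],
)
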
Example #29
def test_historical_features(environment, universal_data_sources,
                             full_feature_names):
    store = environment.feature_store

    (entities, datasets, data_sources) = universal_data_sources

    feature_views = construct_universal_feature_views(data_sources)

    entity_df_with_request_data = datasets.entity_df.copy(deep=True)
    entity_df_with_request_data["val_to_add"] = [
        i for i in range(len(entity_df_with_request_data))
    ]
    entity_df_with_request_data["driver_age"] = [
        i + 100 for i in range(len(entity_df_with_request_data))
    ]

    feature_service = FeatureService(
        name="convrate_plus100",
        features=[
            feature_views.driver[["conv_rate"]], feature_views.driver_odfv
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views.location.with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views.location.with_name("destination").with_join_key_map(
                {"location_id": "destination_id"}),
        ],
    )

    store.apply([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
        *feature_views.values(),
    ])

    event_timestamp = (DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                       in datasets.orders_df.columns else "e_ts")
    full_expected_df = get_expected_training_df(
        datasets.customer_df,
        feature_views.customer,
        datasets.driver_df,
        feature_views.driver,
        datasets.orders_df,
        feature_views.order,
        datasets.location_df,
        feature_views.location,
        datasets.global_df,
        feature_views.global_fv,
        datasets.field_mapping_df,
        feature_views.field_mapping,
        entity_df_with_request_data,
        event_timestamp,
        full_feature_names,
    )

    # The shadow-entity features are only needed for the FeatureService test,
    # so drop them from the expected df here
    expected_df = full_expected_df.drop(
        columns=["origin__temperature", "destination__temperature"],
    )

    job_from_df = store.get_historical_features(
        entity_df=entity_df_with_request_data,
        features=[
            "driver_stats:conv_rate",
            "driver_stats:avg_daily_trips",
            "customer_profile:current_balance",
            "customer_profile:avg_passenger_count",
            "customer_profile:lifetime_trip_count",
            "conv_rate_plus_100:conv_rate_plus_100",
            "conv_rate_plus_100:conv_rate_plus_100_rounded",
            "conv_rate_plus_100:conv_rate_plus_val_to_add",
            "order:order_is_success",
            "global_stats:num_rides",
            "global_stats:avg_ride_length",
            "field_mapping:feature_name",
        ],
        full_feature_names=full_feature_names,
    )

    start_time = datetime.utcnow()
    actual_df_from_df_entities = job_from_df.to_df()

    print(
        f"actual_df_from_df_entities shape: {actual_df_from_df_entities.shape}"
    )
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(
        actual_df_from_df_entities.columns)
    assert_frame_equal(
        expected_df,
        actual_df_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )

    assert_feature_service_correctness(
        store,
        feature_service,
        full_feature_names,
        entity_df_with_request_data,
        expected_df,
        event_timestamp,
    )
    assert_feature_service_entity_mapping_correctness(
        store,
        feature_service_entity_mapping,
        full_feature_names,
        entity_df_with_request_data,
        full_expected_df,
        event_timestamp,
    )
    table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas()

    assert_frame_equal(
        expected_df,
        table_from_df_entities,
        keys=[event_timestamp, "order_id", "driver_id", "customer_id"],
    )
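
# A minimal sketch of the two entity_df forms accepted by
# get_historical_features, as exercised above: a pandas DataFrame, or a SQL
# query string for offline stores backed by a queryable table. Here `store`
# stands in for a configured FeatureStore, and the table and feature names
# are illustrative assumptions.
import pandas as pd

entity_df = pd.DataFrame(
    {
        "driver_id": [1001],
        "event_timestamp": [pd.Timestamp.utcnow().round("ms")],
    }
)
job = store.get_historical_features(
    entity_df=entity_df,  # or: "SELECT driver_id, event_timestamp FROM orders"
    features=["driver_stats:conv_rate"],
)
training_df = job.to_df()        # retrieve as a pandas DataFrame
training_table = job.to_arrow()  # or as a pyarrow Table
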
Example #30
def test_online_retrieval(environment, universal_data_sources,
                          full_feature_names):

    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    feature_views = construct_universal_feature_views(data_sources)

    feature_service = FeatureService(
        "convrate_plus100",
        features=[
            feature_views["driver"][["conv_rate"]],
            feature_views["driver_odfv"],
            feature_views["driver_age_request_fv"],
        ],
    )
    feature_service_entity_mapping = FeatureService(
        name="entity_mapping",
        features=[
            feature_views["location"].with_name("origin").with_join_key_map(
                {"location_id": "origin_id"}),
            feature_views["location"].with_name(
                "destination").with_join_key_map(
                    {"location_id": "destination_id"}),
        ],
    )

    feast_objects = []
    feast_objects.extend(feature_views.values())
    feast_objects.extend([
        driver(),
        customer(),
        location(),
        feature_service,
        feature_service_entity_mapping,
    ])
    fs.apply(feast_objects)
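    # Materialize a window padded by one day on each side so rows at the
    # boundaries of the generated test data land in the online store.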
    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )

    entity_sample = datasets["orders"].sample(10)[[
        "customer_id", "driver_id", "order_id", "event_timestamp"
    ]]
    orders_df = datasets["orders"][(
        datasets["orders"]["customer_id"].isin(entity_sample["customer_id"])
        & datasets["orders"]["driver_id"].isin(entity_sample["driver_id"]))]

    sample_drivers = entity_sample["driver_id"]
    drivers_df = datasets["driver"][datasets["driver"]["driver_id"].isin(
        sample_drivers)]

    sample_customers = entity_sample["customer_id"]
    customers_df = datasets["customer"][datasets["customer"]
                                        ["customer_id"].isin(sample_customers)]

    location_pairs = np.array(
        list(itertools.permutations(entities["location"], 2)))
    sample_location_pairs = location_pairs[np.random.choice(
        len(location_pairs), 10)].T
    origins_df = datasets["location"][datasets["location"]["location_id"].isin(
        sample_location_pairs[0])]
    destinations_df = datasets["location"][
        datasets["location"]["location_id"].isin(sample_location_pairs[1])]

    global_df = datasets["global"]

    entity_rows = [{
        "driver": d,
        "customer_id": c,
        "val_to_add": 50,
        "driver_age": 25
    } for (d, c) in zip(sample_drivers, sample_customers)]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
        "conv_rate_plus_100:conv_rate_plus_100",
        "conv_rate_plus_100:conv_rate_plus_val_to_add",
        "order:order_is_success",
        "global_stats:num_rides",
        "global_stats:avg_ride_length",
        "driver_age:driver_age",
    ]
    unprefixed_feature_refs = [
        f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f
    ]
    # Remove the on demand feature view output features, since they're not present in the source dataframe
    unprefixed_feature_refs.remove("conv_rate_plus_100")
    unprefixed_feature_refs.remove("conv_rate_plus_val_to_add")

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    # The +3 accounts for the driver_id and customer_id entity keys plus the
    # val_to_add request data in the response.
    assert len(keys) == len(feature_refs) + 3
    for feature in feature_refs:
        # full_feature_names does not apply to request feature views
        if full_feature_names and feature != "driver_age:driver_age":
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
            assert ("driver_stats" not in keys
                    and "customer_profile" not in keys and "order" not in keys
                    and "global_stats" not in keys)

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
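        # Compute ground-truth values directly from the raw source dataframes
        # to compare against the online store's response.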
        df_features = get_latest_feature_values_from_dataframes(
            driver_df=drivers_df,
            customer_df=customers_df,
            orders_df=orders_df,
            global_df=global_df,
            entity_row=entity_row,
        )

        assert df_features["customer_id"] == online_features_dict[
            "customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name("conv_rate_plus_100",
                                                       full_feature_names)][i],
            df_features["conv_rate"] + 100,
            delta=0.0001,
        )
        tc.assertAlmostEqual(
            online_features_dict[response_feature_name(
                "conv_rate_plus_val_to_add", full_feature_names)][i],
            df_features["conv_rate"] + df_features["val_to_add"],
            delta=0.0001,
        )
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertAlmostEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[response_feature_name(
                    unprefixed_feature_ref, full_feature_names)][i],
                delta=0.0001,
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{
            "driver": 0,
            "customer_id": 0,
            "val_to_add": 100,
            "driver_age": 125
        }],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        if unprefixed_feature_ref not in {
                "num_rides", "avg_ride_length", "driver_age"
        }:
            tc.assertIsNone(missing_responses_dict[response_feature_name(
                unprefixed_feature_ref, full_feature_names)][0])

    # Check what happens for missing request data
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{
                "driver": 0,
                "customer_id": 0
            }],
            full_feature_names=full_feature_names,
        ).to_dict()

    # Also raises when only some of the request data is provided
    # (driver_age is still missing here)
    with pytest.raises(RequestDataNotFoundInEntityRowsException):
        fs.get_online_features(
            features=feature_refs,
            entity_rows=[{
                "driver": 0,
                "customer_id": 0,
                "val_to_add": 20
            }],
            full_feature_names=full_feature_names,
        ).to_dict()

    assert_feature_service_correctness(
        fs,
        feature_service,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        global_df,
    )

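    # Rebuild the entity rows with origin/destination ids so the
    # entity-mapping feature service can be validated end to end.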
    entity_rows = [{
        "driver": driver,
        "customer_id": customer,
        "origin_id": origin,
        "destination_id": destination,
    } for (driver, customer, origin, destination
           ) in zip(sample_drivers, sample_customers, *sample_location_pairs)]
    assert_feature_service_entity_mapping_correctness(
        fs,
        feature_service_entity_mapping,
        entity_rows,
        full_feature_names,
        drivers_df,
        customers_df,
        orders_df,
        origins_df,
        destinations_df,
    )
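
# A minimal sketch (reusing `fs` from the test above) of how
# full_feature_names shapes the keys returned by get_online_features,
# matching the feature.replace(":", "__") check in the test; the entity row
# value is illustrative.
resp = fs.get_online_features(
    features=["driver_stats:conv_rate"],
    entity_rows=[{"driver": 1001}],
    full_feature_names=True,
).to_dict()
# resp keys: "driver_id" (the entity key) and "driver_stats__conv_rate" (the
# view-prefixed feature). With full_feature_names=False the feature key would
# be the bare "conv_rate".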