Example 1
def test_diff_between_feature_views(simple_dataset_1):
    with prep_file_source(df=simple_dataset_1,
                          event_timestamp_column="ts_1") as file_source:
        pre_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={
                "when": "before"
            },
        ).to_proto()
        post_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={
                "when": "after"
            },
        ).to_proto()

        fco_diffs = diff_between(pre_changed, pre_changed, "feature view")
        assert len(fco_diffs.fco_property_diffs) == 0

        fco_diffs = diff_between(pre_changed, post_changed, "feature view")
        assert len(fco_diffs.fco_property_diffs) == 1

        assert fco_diffs.fco_property_diffs[0].property_name == "tags"
        assert fco_diffs.fco_property_diffs[0].val_existing == {
            "when": "before"
        }
        assert fco_diffs.fco_property_diffs[0].val_declared == {
            "when": "after"
        }
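
All of these examples lean on a prep_file_source test helper that is not shown. Judging from how it is called, it must write the dataframe to a temporary Parquet file and yield a FileSource over it. A minimal sketch under that assumption (the real helper in Feast's test utilities may differ in details):

import contextlib
import os
import tempfile

from feast import FileSource
from feast.data_format import ParquetFormat


@contextlib.contextmanager
def prep_file_source(df, event_timestamp_column=None):
    # dump the dataframe to a throwaway Parquet file and wrap it in a FileSource
    f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
    f.close()
    df.to_parquet(f.name)
    try:
        yield FileSource(
            path=f.name,
            event_timestamp_column=event_timestamp_column,
            file_format=ParquetFormat(),
        )
    finally:
        os.unlink(f.name)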
Example 2
def test_feature_view_inference_success(test_feature_store, dataframe_source):
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=file_source,
            tags={},
        )

        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_table_ref_arg(dataframe_source, "ts_1"),
            tags={},
        )

        fv3 = FeatureView(
            name="fv3",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_query_arg(dataframe_source, "ts_1"),
            tags={},
        )

        test_feature_store.apply([fv1, fv2, fv3])  # Register Feature Views
        feature_view_1 = test_feature_store.list_feature_views()[0]
        feature_view_2 = test_feature_store.list_feature_views()[1]
        feature_view_3 = test_feature_store.list_feature_views()[2]

        actual_file_source = {
            (feature.name, feature.dtype) for feature in feature_view_1.features
        }
        actual_bq_using_table_ref_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_2.features
        }
        actual_bq_using_query_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_3.features
        }
        expected = {
            ("float_col", ValueType.DOUBLE),
            ("int64_col", ValueType.INT64),
            ("string_col", ValueType.STRING),
        }

        assert (
            expected
            == actual_file_source
            == actual_bq_using_table_ref_arg_source
            == actual_bq_using_query_arg_source
        )

        test_feature_store.teardown()
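
The dataframe_source fixture is not shown either, but the expected set pins down its schema: a float, an int64, and a string feature column, plus the ts_1 timestamp and the id join key. One plausible fixture, with placeholder values:

import pandas as pd
import pytest


@pytest.fixture
def dataframe_source():
    # schema reverse-engineered from the expected (name, dtype) pairs above;
    # the concrete values are made up
    ts = pd.Timestamp.now().round("ms")
    return pd.DataFrame(
        {
            "id": [1, 2, 3],
            "float_col": [0.1, 0.2, 0.3],
            "int64_col": [10, 20, 30],
            "string_col": ["a", "b", "c"],
            "ts_1": [ts, ts, ts],
        }
    )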
Example 3
def test_update_entities_with_inferred_types_from_feature_views(
        simple_dataset_1, simple_dataset_2):
    with prep_file_source(
            df=simple_dataset_1,
            event_timestamp_column="ts_1") as file_source, prep_file_source(
                df=simple_dataset_2,
                event_timestamp_column="ts_1") as file_source_2:

        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source_2,
            ttl=None,
        )

        actual_1 = Entity(name="id", join_key="id_join_key")
        actual_2 = Entity(name="id", join_key="id_join_key")

        update_entities_with_inferred_types_from_feature_views(
            [actual_1], [fv1], RepoConfig(provider="local", project="test"))
        update_entities_with_inferred_types_from_feature_views(
            [actual_2], [fv2], RepoConfig(provider="local", project="test"))
        assert actual_1 == Entity(name="id",
                                  join_key="id_join_key",
                                  value_type=ValueType.INT64)
        assert actual_2 == Entity(name="id",
                                  join_key="id_join_key",
                                  value_type=ValueType.STRING)

        with pytest.raises(RegistryInferenceFailure):
            # two viable data types
            update_entities_with_inferred_types_from_feature_views(
                [Entity(name="id", join_key="id_join_key")],
                [fv1, fv2],
                RepoConfig(provider="local", project="test"),
            )
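
For the INT64 and STRING inferences to come out as asserted, simple_dataset_1 must carry an integer id_join_key column and simple_dataset_2 a string one. Minimal fixtures consistent with the test (the real ones also carry feature columns):

import pandas as pd
import pytest


@pytest.fixture
def simple_dataset_1():
    ts = pd.Timestamp.now().round("ms")
    # integer join key -> entity value type inferred as INT64
    return pd.DataFrame({"id_join_key": [1, 2, 3], "ts_1": [ts, ts, ts]})


@pytest.fixture
def simple_dataset_2():
    ts = pd.Timestamp.now().round("ms")
    # string join key -> entity value type inferred as STRING
    return pd.DataFrame({"id_join_key": ["a", "b", "c"], "ts_1": [ts, ts, ts]})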
Example 4
def test_reapply_feature_view_success(test_feature_store, dataframe_source):
    with prep_file_source(df=dataframe_source,
                          event_timestamp_column="ts_1") as file_source:

        e = Entity(name="id",
                   join_keys=["id_join_key"],
                   value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="my_feature_view_1",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )

        # Register Feature View
        test_feature_store.apply([fv1, e])

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 0

        # Run materialization
        test_feature_store.materialize(datetime(2020, 1, 1),
                                       datetime(2021, 1, 1))

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 1

        # Apply again
        test_feature_store.apply([fv1])

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 1

        # Change and apply Feature View
        fv1 = FeatureView(
            name="my_feature_view_1",
            schema=[Field(name="int64_col", dtype=Int64)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        test_feature_store.apply([fv1])

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 0

        test_feature_store.teardown()
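
test_feature_store is assumed to be a FeatureStore backed by the local provider with throwaway registry and SQLite online-store files. One plausible fixture (paths and project name are invented):

import tempfile

import pytest
from feast import FeatureStore, RepoConfig
from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig


@pytest.fixture
def test_feature_store():
    # local provider with disposable registry and online-store files
    with tempfile.TemporaryDirectory() as tmp_dir:
        yield FeatureStore(
            config=RepoConfig(
                registry=f"{tmp_dir}/registry.db",
                project="default",
                provider="local",
                online_store=SqliteOnlineStoreConfig(path=f"{tmp_dir}/online.db"),
            )
        )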
Example 5
def test_tag_objects_for_keep_delete_update_add(simple_dataset_1):
    with prep_file_source(df=simple_dataset_1,
                          event_timestamp_column="ts_1") as file_source:
        to_delete = FeatureView(
            name="to_delete",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        unchanged_fv = FeatureView(
            name="fv1",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        pre_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={"when": "before"},
        )
        post_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={"when": "after"},
        )
        to_add = FeatureView(
            name="to_add",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )

        keep, delete, update, add = tag_objects_for_keep_delete_update_add(
            [unchanged_fv, pre_changed, to_delete],
            [unchanged_fv, post_changed, to_add])

        assert len(list(keep)) == 2
        assert unchanged_fv in keep
        assert pre_changed in keep
        assert post_changed not in keep
        assert len(list(delete)) == 1
        assert to_delete in delete
        assert len(list(update)) == 2
        assert unchanged_fv in update
        assert post_changed in update
        assert pre_changed not in update
        assert len(list(add)) == 1
        assert to_add in add
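
The assertions pin down the partition's semantics: keep and delete are drawn from the existing objects, update and add from the desired ones, with membership decided by name. A name-keyed sketch that reproduces exactly this behavior (hypothetical; not Feast's actual implementation):

def tag_objects_for_keep_delete_update_add_sketch(existing, desired):
    existing_names = {o.name for o in existing}
    desired_names = {o.name for o in desired}
    keep = [o for o in existing if o.name in desired_names]  # existing copies to keep
    delete = [o for o in existing if o.name not in desired_names]  # gone from desired
    update = [o for o in desired if o.name in existing_names]  # desired copies to write
    add = [o for o in desired if o.name not in existing_names]  # brand-new objects
    return keep, delete, update, add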
Example 6
def test_update_data_sources_with_inferred_event_timestamp_col(
        simple_dataset_1):
    df_with_two_viable_timestamp_cols = simple_dataset_1.copy(deep=True)
    df_with_two_viable_timestamp_cols["ts_2"] = simple_dataset_1["ts_1"]

    with prep_file_source(df=simple_dataset_1) as file_source:
        data_sources = [
            file_source,
            simple_bq_source_using_table_ref_arg(simple_dataset_1),
            simple_bq_source_using_query_arg(simple_dataset_1),
        ]
        update_data_sources_with_inferred_event_timestamp_col(
            data_sources, RepoConfig(provider="local", project="test"))
        actual_event_timestamp_cols = [
            source.event_timestamp_column for source in data_sources
        ]

        assert actual_event_timestamp_cols == ["ts_1", "ts_1", "ts_1"]

    with prep_file_source(df=df_with_two_viable_timestamp_cols) as file_source:
        with pytest.raises(RegistryInferenceFailure):
            # two viable event_timestamp_columns
            update_data_sources_with_inferred_event_timestamp_col(
                [file_source], RepoConfig(provider="local", project="test"))
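
The failure branch relies on the inference finding more than one viable timestamp column. The rule the test implies, sketched without reference to Feast's internals (a hypothetical helper, not the library's code): exactly one datetime-typed column may exist.

import pandas as pd


def infer_event_timestamp_column(df: pd.DataFrame) -> str:
    # hypothetical sketch: succeed only when exactly one datetime column exists
    candidates = [
        col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col])
    ]
    if len(candidates) != 1:
        raise ValueError(f"expected exactly one timestamp column, got {candidates}")
    return candidates[0]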
Example 7
def test_write_to_online_store_event_check(local_redis_environment):
    # bail out (the test silently passes) when running in local test mode
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store

    # write the same data points 3 times with different timestamps
    now = pd.Timestamp(datetime.datetime.utcnow()).round("ms")
    hour_ago = pd.Timestamp(datetime.datetime.utcnow() -
                            timedelta(hours=1)).round("ms")
    latest = pd.Timestamp(datetime.datetime.utcnow() +
                          timedelta(seconds=1)).round("ms")

    data = {
        "id": [123, 567, 890],
        "string_col": ["OLD_FEATURE", "LATEST_VALUE2", "LATEST_VALUE3"],
        "ts_1": [hour_ago, now, now],
    }
    dataframe_source = pd.DataFrame(data)
    with prep_file_source(df=dataframe_source,
                          event_timestamp_column="ts_1") as file_source:
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="feature_view_123",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        # Register Feature View and Entity
        fs.apply([fv1, e])

        # data to ingest into the Online Store (recent)
        data = {
            "id": [123],
            "string_col": ["hi_123"],
            "ts_1": [now],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"], entity_rows=[{"id": 123}]
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"

        # data to ingest into the Online Store (1 hour delayed);
        # should NOT overwrite the feature for id=123, because it is less recent
        data = {
            "id": [123, 567, 890],
            "string_col": ["bye_321", "hello_123", "greetings_321"],
            "ts_1": [hour_ago, hour_ago, hour_ago],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # should overwrite string_col for id=123 because it is the most recent value by event_timestamp
        data = {
            "id": [123],
            "string_col": ["LATEST_VALUE"],
            "ts_1": [latest],
        }
        df_data = pd.DataFrame(data)

        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # materialize from the batch source (dataframe_source) into the online store
        fs.materialize(
            start_date=datetime.datetime.now() - timedelta(hours=12),
            end_date=datetime.datetime.utcnow(),
        )

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "LATEST_VALUE2"
        assert df["string_col"].iloc[2] == "LATEST_VALUE3"