Example 1
def test_apply_duplicated_featureview_names(feature_store_with_local_registry):
    """ Test applying feature views with duplicated names"""

    driver_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=10),
        online=False,
        input=FileSource(path="driver_stats.parquet"),
        tags={},
    )

    customer_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["id"],
        ttl=timedelta(seconds=10),
        online=False,
        input=FileSource(path="customer_stats.parquet"),
        tags={},
    )
    try:
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
        error = None
    except ValueError as e:
        error = e
    assert (
        isinstance(error, ValueError)
        and "Please ensure that all feature view names are unique" in error.args[0]
    )

    feature_store_with_local_registry.teardown()
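
The try/except/assert pattern above (repeated in Example 4) can be written more idiomatically with pytest's context manager. A minimal sketch, assuming the same fixture and the driver_stats/customer_stats views defined in the example:

import pytest

def test_apply_duplicated_featureview_names_idiomatic(feature_store_with_local_registry):
    # driver_stats and customer_stats are the conflicting views defined above.
    # pytest.raises checks both the exception type and, via match=, the message.
    with pytest.raises(ValueError, match="Please ensure that all feature view names are unique"):
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
    feature_store_with_local_registry.teardown()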
Example 2
def test_diff_between_feature_views(simple_dataset_1):
    with prep_file_source(df=simple_dataset_1,
                          event_timestamp_column="ts_1") as file_source:
        pre_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={
                "when": "before"
            },
        ).to_proto()
        post_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={
                "when": "after"
            },
        ).to_proto()

        fco_diffs = diff_between(pre_changed, pre_changed, "feature view")
        assert len(fco_diffs.fco_property_diffs) == 0

        fco_diffs = diff_between(pre_changed, post_changed, "feature view")
        assert len(fco_diffs.fco_property_diffs) == 1

        assert fco_diffs.fco_property_diffs[0].property_name == "tags"
        assert fco_diffs.fco_property_diffs[0].val_existing == {
            "when": "before"
        }
        assert fco_diffs.fco_property_diffs[0].val_declared == {
            "when": "after"
        }
Example 3
def test_update_entities_with_inferred_types_from_feature_views(
        simple_dataset_1, simple_dataset_2):
    with prep_file_source(
            df=simple_dataset_1,
            event_timestamp_column="ts_1") as file_source, prep_file_source(
                df=simple_dataset_2,
                event_timestamp_column="ts_1") as file_source_2:

        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            input=file_source,
            ttl=None,
        )
        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            input=file_source_2,
            ttl=None,
        )

        actual_1 = Entity(name="id")
        actual_2 = Entity(name="id")

        update_entities_with_inferred_types_from_feature_views([actual_1],
                                                               [fv1])
        update_entities_with_inferred_types_from_feature_views([actual_2],
                                                               [fv2])
        assert actual_1 == Entity(name="id", value_type=ValueType.INT64)
        assert actual_2 == Entity(name="id", value_type=ValueType.STRING)

        with pytest.raises(RegistryInferenceFailure):
            # two viable data types
            update_entities_with_inferred_types_from_feature_views(
                [Entity(name="id")], [fv1, fv2])
Example 4
def test_apply_conflicting_featureview_names(
        feature_store_with_local_registry):
    """ Test applying feature views with non-case-insensitively unique names"""

    driver_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=10),
        online=False,
        batch_source=FileSource(path="driver_stats.parquet"),
        tags={},
    )

    customer_stats = FeatureView(
        name="DRIVER_HOURLY_STATS",
        entities=["id"],
        ttl=timedelta(seconds=10),
        online=False,
        batch_source=FileSource(path="customer_stats.parquet"),
        tags={},
    )
    try:
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
        error = None
    except ValueError as e:
        error = e
    assert (
        isinstance(error, ValueError) and
        "Please ensure that all feature view names are case-insensitively unique"
        in error.args[0])

    feature_store_with_local_registry.teardown()
Example 5
def test_infer_entity_value_type_from_feature_views(simple_dataset_1,
                                                    simple_dataset_2):
    with prep_file_source(
            df=simple_dataset_1,
            event_timestamp_column="ts_1") as file_source, prep_file_source(
                df=simple_dataset_2,
                event_timestamp_column="ts_1") as file_source_2:

        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            input=file_source,
            ttl=None,
        )
        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            input=file_source_2,
            ttl=None,
        )

        actual_1 = infer_entity_value_type_from_feature_views(
            [Entity(name="id")], [fv1])
        actual_2 = infer_entity_value_type_from_feature_views(
            [Entity(name="id")], [fv2])
        assert actual_1 == [Entity(name="id", value_type=ValueType.INT64)]
        assert actual_2 == [Entity(name="id", value_type=ValueType.STRING)]

        with pytest.raises(ValueError):
            # two viable data types
            infer_entity_value_type_from_feature_views([Entity(name="id")],
                                                       [fv1, fv2])
Example 6
def test_feature_view_inference_success(test_feature_store, dataframe_source):
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=file_source,
            tags={},
        )

        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_table_ref_arg(dataframe_source, "ts_1"),
            tags={},
        )

        fv3 = FeatureView(
            name="fv3",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_query_arg(dataframe_source, "ts_1"),
            tags={},
        )

        test_feature_store.apply([fv1, fv2, fv3])  # Register Feature Views
        feature_view_1 = test_feature_store.list_feature_views()[0]
        feature_view_2 = test_feature_store.list_feature_views()[1]
        feature_view_3 = test_feature_store.list_feature_views()[2]

        actual_file_source = {
            (feature.name, feature.dtype) for feature in feature_view_1.features
        }
        actual_bq_using_table_ref_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_2.features
        }
        actual_bq_using_query_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_3.features
        }
        expected = {
            ("float_col", ValueType.DOUBLE),
            ("int64_col", ValueType.INT64),
            ("string_col", ValueType.STRING),
        }

        assert (
            expected
            == actual_file_source
            == actual_bq_using_table_ref_arg_source
            == actual_bq_using_query_arg_source
        )

        test_feature_store.teardown()
Example 7
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(name="fs1_my_entity_1",
                value_type=ValueType.STRING,
                description="something")

    e2 = Entity(name="fs1_my_entity_2",
                value_type=ValueType.STRING,
                description="something")

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual

    test_feature_store.teardown()
Example 8
def test_reapply_feature_view_success(test_feature_store, dataframe_source):
    with prep_file_source(df=dataframe_source,
                          event_timestamp_column="ts_1") as file_source:

        e = Entity(name="id",
                   join_keys=["id_join_key"],
                   value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="my_feature_view_1",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )

        # Register Feature View
        test_feature_store.apply([fv1, e])

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 0

        # Run materialization
        test_feature_store.materialize(datetime(2020, 1, 1),
                                       datetime(2021, 1, 1))

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 1

        # Apply again
        test_feature_store.apply([fv1])

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 1

        # Change and apply Feature View
        fv1 = FeatureView(
            name="my_feature_view_1",
            schema=[Field(name="int64_col", dtype=Int64)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        test_feature_store.apply([fv1])

        # Check Feature View
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 0

        test_feature_store.teardown()
Example 9
def test_tag_objects_for_keep_delete_update_add(simple_dataset_1):
    with prep_file_source(df=simple_dataset_1,
                          event_timestamp_column="ts_1") as file_source:
        to_delete = FeatureView(
            name="to_delete",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        unchanged_fv = FeatureView(
            name="fv1",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        pre_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={"when": "before"},
        )
        post_changed = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
            tags={"when": "after"},
        )
        to_add = FeatureView(
            name="to_add",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )

        keep, delete, update, add = tag_objects_for_keep_delete_update_add(
            [unchanged_fv, pre_changed, to_delete],
            [unchanged_fv, post_changed, to_add])

        assert len(list(keep)) == 2
        assert unchanged_fv in keep
        assert pre_changed in keep
        assert post_changed not in keep
        assert len(list(delete)) == 1
        assert to_delete in delete
        assert len(list(update)) == 2
        assert unchanged_fv in update
        assert post_changed in update
        assert pre_changed not in update
        assert len(list(add)) == 1
        assert to_add in add
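
The expected partition above can be read as: keep and delete are taken from the existing objects, update and add from the desired ones, matched by name. A hypothetical stand-in for the real helper, written only to illustrate that logic:

def tag_objects_sketch(existing, desired):
    # Illustrative reimplementation; the real Feast helper may differ in detail.
    existing_names = {o.name for o in existing}
    desired_names = {o.name for o in desired}
    keep = [o for o in existing if o.name in desired_names]        # unchanged_fv, pre_changed
    delete = [o for o in existing if o.name not in desired_names]  # to_delete
    update = [o for o in desired if o.name in existing_names]      # unchanged_fv, post_changed
    add = [o for o in desired if o.name not in existing_names]     # to_add
    return keep, delete, update, add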
Example 10
def test_get_column_names_preserves_feature_ordering():
    entity = Entity("my-entity",
                    description="My entity",
                    value_type=ValueType.STRING)
    fv = FeatureView(
        name="my-fv",
        entities=["my-entity"],
        ttl=timedelta(days=1),
        batch_source=BigQuerySource(table="non-existent-mock"),
        schema=[
            Field(name="a", dtype=String),
            Field(name="b", dtype=String),
            Field(name="c", dtype=String),
            Field(name="d", dtype=String),
            Field(name="e", dtype=String),
            Field(name="f", dtype=String),
            Field(name="g", dtype=String),
            Field(name="h", dtype=String),
            Field(name="i", dtype=String),
            Field(name="j", dtype=String),
        ],
    )

    _, feature_list, _, _ = _get_column_names(fv, [entity])
    assert feature_list == ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
Example 11
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == String
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == Array(String)
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == Array(Bytes)
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == String
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == Array(String)
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == Array(Bytes)
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
Example 12
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == ValueType.INT64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == ValueType.STRING
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == ValueType.STRING_LIST
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == ValueType.INT64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == ValueType.STRING
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == ValueType.STRING_LIST
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == ValueType.BYTES_LIST
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
Example 13
def get_feature_view(data_source: Union[FileSource, BigQuerySource]) -> FeatureView:
    return FeatureView(
        name="test_bq_correctness",
        entities=["driver"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(days=5),
        input=data_source,
    )
Example 14
def get_feature_view(data_source: DataSource) -> FeatureView:
    return FeatureView(
        name="test_bq_correctness",
        entities=["driver"],
        features=[Feature("value", ValueType.FLOAT)],
        ttl=timedelta(days=5),
        batch_source=data_source,
    )
Example 15
    def test_bigquery_query_to_datastore_correctness(self):
        # create dataset
        ts = pd.Timestamp.now(tz="UTC").round("ms")
        data = {
            "id": [1, 2, 1],
            "value": [0.1, 0.2, 0.3],
            "ts_1": [ts - timedelta(minutes=2), ts, ts],
            "created_ts": [ts, ts, ts],
        }
        df = pd.DataFrame.from_dict(data)

        # load dataset into BigQuery
        job_config = bigquery.LoadJobConfig()
        table_id = f"{self.gcp_project}.{self.bigquery_dataset}.query_correctness_{int(time.time())}"
        query = f"SELECT * FROM `{table_id}`"
        job = self.client.load_table_from_dataframe(df,
                                                    table_id,
                                                    job_config=job_config)
        job.result()

        # create FeatureView
        fv = FeatureView(
            name="test_bq_query_correctness",
            entities=["driver_id"],
            features=[Feature("value", ValueType.FLOAT)],
            ttl=timedelta(minutes=5),
            input=BigQuerySource(
                event_timestamp_column="ts",
                created_timestamp_column="created_ts",
                field_mapping={
                    "ts_1": "ts",
                    "id": "driver_id"
                },
                date_partition_column="",
                query=query,
            ),
        )
        config = RepoConfig(
            metadata_store="./metadata.db",
            project=f"test_bq_query_correctness_{int(time.time())}",
            provider="gcp",
        )
        fs = FeatureStore(config=config)
        fs.apply([fv])

        # run materialize()
        fs.materialize(
            [fv.name],
            datetime.utcnow() - timedelta(minutes=5),
            datetime.utcnow() - timedelta(minutes=0),
        )

        # check result of materialize()
        response_dict = fs.get_online_features([f"{fv.name}:value"],
                                               [{
                                                   "driver_id": 1
                                               }]).to_dict()
        assert abs(response_dict[f"{fv.name}:value"][0] - 0.3) < 1e-6
Example 16
def test_update_feature_views_with_inferred_features():
    file_source = FileSource(name="test", path="test path")
    entity1 = Entity(name="test1", join_keys=["test_column_1"])
    entity2 = Entity(name="test2", join_keys=["test_column_2"])
    feature_view_1 = FeatureView(
        name="test1",
        entities=[entity1],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
        ],
        source=file_source,
    )
    feature_view_2 = FeatureView(
        name="test2",
        entities=[entity1, entity2],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
            Field(name="test_column_2", dtype=String),
        ],
        source=file_source,
    )

    assert len(feature_view_1.schema) == 2
    assert len(feature_view_1.features) == 2

    # The entity field should be deleted from the schema and features of the feature view.
    update_feature_views_with_inferred_features([feature_view_1], [entity1],
                                                RepoConfig(provider="local",
                                                           project="test"))
    assert len(feature_view_1.schema) == 1
    assert len(feature_view_1.features) == 1

    assert len(feature_view_2.schema) == 3
    assert len(feature_view_2.features) == 3

    # The entity fields should be deleted from the schema and features of the feature view.
    update_feature_views_with_inferred_features(
        [feature_view_2],
        [entity1, entity2],
        RepoConfig(provider="local", project="test"),
    )
    assert len(feature_view_2.schema) == 1
    assert len(feature_view_2.features) == 1
Example 17
def test_hash():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view_1 = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    feature_view_2 = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    feature_view_3 = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[Field(name="feature1", dtype=Float32)],
        source=file_source,
    )
    feature_view_4 = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[Field(name="feature1", dtype=Float32)],
        source=file_source,
        description="test",
    )

    s1 = {feature_view_1, feature_view_2}
    assert len(s1) == 1

    s2 = {feature_view_1, feature_view_3}
    assert len(s2) == 2

    s3 = {feature_view_3, feature_view_4}
    assert len(s3) == 2

    s4 = {feature_view_1, feature_view_2, feature_view_3, feature_view_4}
    assert len(s4) == 3
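
The set sizes above imply that FeatureView equality and hashing take the whole definition into account, including the schema and the description. A hypothetical sketch of that idea, not Feast's actual implementation:

class DefinitionHashed:
    # Hypothetical: derive identity from the serialized protobuf definition,
    # so any field change (schema, description, ...) produces a distinct member.
    def __hash__(self):
        return hash(self.to_proto().SerializeToString())

    def __eq__(self, other):
        return type(self) is type(other) and self.to_proto() == other.to_proto()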
Example 18
def test_apply_data_source(test_registry: Registry):
    # Create Feature Views
    batch_source = FileSource(
        name="test_source",
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register data source and feature view
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)

    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_data_source = registry_data_sources[0]
    assert registry_data_source == batch_source

    # Check that change to batch source propagates
    batch_source.timestamp_field = "new_ts_col"
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)
    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_batch_source = test_registry.list_data_sources(project)[0]
    assert registry_batch_source == batch_source

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example 19
def test_update_entities_with_inferred_types_from_feature_views(
        simple_dataset_1, simple_dataset_2):
    with prep_file_source(
            df=simple_dataset_1,
            event_timestamp_column="ts_1") as file_source, prep_file_source(
                df=simple_dataset_2,
                event_timestamp_column="ts_1") as file_source_2:

        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            batch_source=file_source,
            ttl=None,
        )
        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            batch_source=file_source_2,
            ttl=None,
        )

        actual_1 = Entity(name="id", join_keys=["id_join_key"])
        actual_2 = Entity(name="id", join_keys=["id_join_key"])

        update_entities_with_inferred_types_from_feature_views(
            [actual_1], [fv1], RepoConfig(provider="local", project="test"))
        update_entities_with_inferred_types_from_feature_views(
            [actual_2], [fv2], RepoConfig(provider="local", project="test"))
        assert actual_1 == Entity(name="id",
                                  join_keys=["id_join_key"],
                                  value_type=ValueType.INT64)
        assert actual_2 == Entity(name="id",
                                  join_keys=["id_join_key"],
                                  value_type=ValueType.STRING)

        with pytest.raises(RegistryInferenceFailure):
            # two viable data types
            update_entities_with_inferred_types_from_feature_views(
                [Entity(name="id", join_keys=["id_join_key"])],
                [fv1, fv2],
                RepoConfig(provider="local", project="test"),
            )
Example 20
def create_driver_hourly_stats_feature_view(source):
    driver_stats_feature_view = FeatureView(
        name="driver_stats",
        entities=["driver_id"],
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT32),
        ],
        input=source,
        ttl=timedelta(hours=2),
    )
    return driver_stats_feature_view
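
This helper and Example 21 below are the same function written against two generations of the FeatureView constructor. A hedged summary of how the parameters map (the exact releases in which each rename happened are not stated here):

# Older kwargs (this example)                         ->  newer kwargs (Example 21)
# features=[Feature(name=..., dtype=ValueType.FLOAT)] ->  schema=[Field(name=..., dtype=Float32)]
# input=source                                        ->  source=source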
Example 21
def create_driver_hourly_stats_feature_view(source):
    driver_stats_feature_view = FeatureView(
        name="driver_stats",
        entities=["driver_id"],
        schema=[
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int32),
        ],
        source=source,
        ttl=timedelta(hours=2),
    )
    return driver_stats_feature_view
Example 22
def create_customer_daily_profile_feature_view(source):
    customer_profile_feature_view = FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        features=[
            Feature(name="current_balance", dtype=ValueType.FLOAT),
            Feature(name="avg_passenger_count", dtype=ValueType.FLOAT),
            Feature(name="lifetime_trip_count", dtype=ValueType.INT32),
        ],
        input=source,
        ttl=timedelta(days=2),
    )
    return customer_profile_feature_view
Example 23
def test_feature_view_kw_args_normal():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    _ = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1", "feature2"]]]
    )
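
Indexing a FeatureView with a list of feature names, as above, appears to yield a projection (a subset of the view's features) for the FeatureService to reference. A short usage sketch reusing the objects from this example; the service name is illustrative:

narrower = FeatureService(
    name="my-narrower-service",  # hypothetical name
    features=[feature_view[["feature1"]]],  # select only one of the two features
)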
Example 24
def test_apply_feature_view_success(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == String
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == Array(String)
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == Array(Bytes)
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == String
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == Array(String)
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == Array(Bytes)
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example 25
def test_apply_feature_view_integration(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == ValueType.INT64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == ValueType.STRING
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == ValueType.STRING_LIST
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == ValueType.INT64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == ValueType.STRING
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == ValueType.STRING_LIST
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == ValueType.BYTES_LIST
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example 26
    def test_bigquery_ingestion_correctness(self):
        # create dataset
        ts = pd.Timestamp.now(tz="UTC").round("ms")
        checked_value = (
            random.random()
        )  # random value so the test cannot pass unless writes actually reached the online store
        data = {
            "id": [1, 2, 1],
            "value": [0.1, 0.2, checked_value],
            "ts_1": [ts - timedelta(minutes=2), ts, ts],
            "created_ts": [ts, ts, ts],
        }
        df = pd.DataFrame.from_dict(data)

        # load dataset into BigQuery
        job_config = bigquery.LoadJobConfig()
        table_id = (
            f"{self.gcp_project}.{self.bigquery_dataset}.correctness_{int(time.time())}"
        )
        job = self.client.load_table_from_dataframe(df,
                                                    table_id,
                                                    job_config=job_config)
        job.result()

        # create FeatureView
        fv = FeatureView(
            name="test_bq_correctness",
            entities=["driver_id"],
            features=[Feature("value", ValueType.FLOAT)],
            ttl=timedelta(minutes=5),
            input=BigQuerySource(
                event_timestamp_column="ts",
                table_ref=table_id,
                created_timestamp_column="created_ts",
                field_mapping={
                    "ts_1": "ts",
                    "id": "driver_id"
                },
                date_partition_column="",
            ),
        )
        config = RepoConfig(
            metadata_store="./metadata.db",
            project="default",
            provider="gcp",
            online_store=OnlineStoreConfig(
                local=LocalOnlineStoreConfig("online_store.db")),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv])

        # run materialize()
        fs.materialize(
            ["test_bq_correctness"],
            datetime.utcnow() - timedelta(minutes=5),
            datetime.utcnow() - timedelta(minutes=0),
        )

        # check result of materialize()
        entity_key = EntityKeyProto(entity_names=["driver_id"],
                                    entity_values=[ValueProto(int64_val=1)])
        t, val = fs._get_provider().online_read("default", fv, entity_key)
        assert abs(val["value"].double_val - checked_value) < 1e-6
Example 27
def test_hash():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    sources = [feature_view]
    on_demand_feature_view_1 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf1,
    )
    on_demand_feature_view_2 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf1,
    )
    on_demand_feature_view_3 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf2,
    )
    on_demand_feature_view_4 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf2,
        description="test",
    )

    s1 = {on_demand_feature_view_1, on_demand_feature_view_2}
    assert len(s1) == 1

    s2 = {on_demand_feature_view_1, on_demand_feature_view_3}
    assert len(s2) == 2

    s3 = {on_demand_feature_view_3, on_demand_feature_view_4}
    assert len(s3) == 2

    s4 = {
        on_demand_feature_view_1,
        on_demand_feature_view_2,
        on_demand_feature_view_3,
        on_demand_feature_view_4,
    }
    assert len(s4) == 3
Example 28
def test_historical_features_from_bigquery_sources_containing_backfills(
        capsys):
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)

    entity_dataframe = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": now + timedelta(days=2)
        },
        {
            "driver_id": 1002,
            "event_timestamp": now + timedelta(days=2)
        },
    ])

    driver_stats_df = pd.DataFrame(data=[
        # Duplicated rows simple case
        {
            "driver_id": 1001,
            "avg_daily_trips": 10,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1001,
            "avg_daily_trips": 20,
            "event_timestamp": tomorrow,
            "created": tomorrow,
        },
        # Duplicated rows after a backfill
        {
            "driver_id": 1002,
            "avg_daily_trips": 30,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1002,
            "avg_daily_trips": 40,
            "event_timestamp": tomorrow,
            "created": now,
        },
    ])

    expected_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": now + timedelta(days=2),
            "avg_daily_trips": 20,
        },
        {
            "driver_id": 1002,
            "event_timestamp": now + timedelta(days=2),
            "avg_daily_trips": 40,
        },
    ])

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Entity Dataframe SQL query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(entity_dataframe, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_stats_df,
                                                  driver_table_id)

        store = FeatureStore(config=RepoConfig(
            registry=os.path.join(temp_dir, "registry.db"),
            project="".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)),
            provider="gcp",
            offline_store=BigQueryOfflineStoreConfig(type="bigquery",
                                                     dataset=bigquery_dataset),
        ))

        driver = Entity(name="driver",
                        join_key="driver_id",
                        value_type=ValueType.INT64)
        driver_fv = FeatureView(
            name="driver_stats",
            entities=["driver"],
            features=[Feature(name="avg_daily_trips", dtype=ValueType.INT32)],
            batch_source=BigQuerySource(
                table_ref=driver_table_id,
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created",
            ),
            ttl=None,
        )

        store.apply([driver, driver_fv])

        try:
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=["driver_stats:avg_daily_trips"],
                full_feature_names=False,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns)
            assert_frame_equal(
                expected_df.sort_values(by=["driver_id"]).reset_index(
                    drop=True),
                actual_df_from_sql_entities[expected_df.columns].sort_values(
                    by=["driver_id"]).reset_index(drop=True),
                check_dtype=False,
            )

        finally:
            store.teardown()
Example 29
def test_historical_features_from_bigquery_sources_containing_backfills(
        environment):
    store = environment.feature_store

    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)
    day_after_tomorrow = now + timedelta(days=2)

    entity_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": day_after_tomorrow
        },
        {
            "driver_id": 1002,
            "event_timestamp": day_after_tomorrow
        },
    ])

    driver_stats_df = pd.DataFrame(data=[
        # Duplicated rows simple case
        {
            "driver_id": 1001,
            "avg_daily_trips": 10,
            "event_timestamp": now,
            "created": now,
        },
        {
            "driver_id": 1001,
            "avg_daily_trips": 20,
            "event_timestamp": now,
            "created": tomorrow,
        },
        # Duplicated rows after a backfill
        {
            "driver_id": 1002,
            "avg_daily_trips": 30,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1002,
            "avg_daily_trips": 40,
            "event_timestamp": tomorrow,
            "created": now,
        },
    ])

    expected_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": day_after_tomorrow,
            "avg_daily_trips": 20,
        },
        {
            "driver_id": 1002,
            "event_timestamp": day_after_tomorrow,
            "avg_daily_trips": 40,
        },
    ])

    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=
        f"test_driver_stats_{int(time.time_ns())}_{random.randint(1000, 9999)}",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver = Entity(name="driver",
                    join_keys=["driver_id"],
                    value_type=ValueType.INT64)
    driver_fv = FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        batch_source=driver_stats_data_source,
        ttl=None,
    )

    store.apply([driver, driver_fv])

    offline_job = store.get_historical_features(
        entity_df=entity_df,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
    )

    start_time = datetime.utcnow()
    actual_df = offline_job.to_df()

    print(f"actual_df shape: {actual_df.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df.columns)
    assert_frame_equal(expected_df, actual_df, keys=["driver_id"])
Example 30
def test_modify_feature_views_success(test_registry, request_source_schema):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    request_source = RequestSource(
        name="request_source",
        schema=request_source_schema,
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[Field(name="fs1_my_feature_1", dtype=Int64)],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype(
            "category")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    project = "project"

    # Register Feature Views
    test_registry.apply_feature_view(odfv1, project)
    test_registry.apply_feature_view(fv1, project)

    # Modify odfv by changing a single feature dtype
    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("float")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    # Apply the modified odfv
    test_registry.apply_feature_view(odfv1, project)

    # Check odfv
    on_demand_feature_views = test_registry.list_on_demand_feature_views(
        project)

    assert (
        len(on_demand_feature_views) == 1
        and on_demand_feature_views[0].name == "odfv1"
        and on_demand_feature_views[0].features[0].name == "odfv1_my_feature_1"
        and on_demand_feature_views[0].features[0].dtype == Float32
        and on_demand_feature_views[0].features[1].name == "odfv1_my_feature_2"
        and on_demand_feature_views[0].features[1].dtype == Int32)
    request_schema = on_demand_feature_views[0].get_request_data_schema()
    assert (list(request_schema.keys())[0] == "my_input_1"
            and list(request_schema.values())[0] == ValueType.INT32)

    feature_view = test_registry.get_on_demand_feature_view("odfv1", project)
    assert (feature_view.name == "odfv1"
            and feature_view.features[0].name == "odfv1_my_feature_1"
            and feature_view.features[0].dtype == Float32
            and feature_view.features[1].name == "odfv1_my_feature_2"
            and feature_view.features[1].dtype == Int32)
    request_schema = feature_view.get_request_data_schema()
    assert (list(request_schema.keys())[0] == "my_input_1"
            and list(request_schema.values())[0] == ValueType.INT32)

    # Make sure fv1 is untouched
    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()