Example #1
0
def test_apply_object_and_read(test_feature_store):
    """Apply two entities and two feature views, then read one of each back.

    The two feature views differ only by name, and so do the two entities.
    The objects fetched from the store must equal the ones they were applied
    from and must differ from their differently named siblings.
    """
    assert isinstance(test_feature_store, FeatureStore)

    parquet_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    def make_entity(entity_name):
        # Both entities share everything except their name.
        return Entity(
            name=entity_name,
            value_type=ValueType.STRING,
            description="something",
        )

    def make_view(view_name):
        # Both views share schema, entity, tags, source and TTL.
        return FeatureView(
            name=view_name,
            schema=[
                Field(name="fs1_my_feature_1", dtype=Int64),
                Field(name="fs1_my_feature_2", dtype=String),
                Field(name="fs1_my_feature_3", dtype=Array(String)),
                Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
            ],
            entities=["fs1_my_entity_1"],
            tags={"team": "matchmaking"},
            batch_source=parquet_source,
            ttl=timedelta(minutes=5),
        )

    first_entity = make_entity("fs1_my_entity_1")
    second_entity = make_entity("fs1_my_entity_2")
    first_view = make_view("my_feature_view_1")
    second_view = make_view("my_feature_view_2")

    # Register everything in a single apply call.
    test_feature_store.apply(
        [first_view, first_entity, second_view, second_entity])

    stored_view = test_feature_store.get_feature_view("my_feature_view_1")
    stored_entity = test_feature_store.get_entity("fs1_my_entity_1")

    assert first_view == stored_view
    assert first_entity == stored_entity
    assert second_view != stored_view
    assert second_entity != stored_entity

    test_feature_store.teardown()
Example #2
0
def test_update_feature_views_with_inferred_features():
    """Entity join-key columns must be removed from a view's schema and features.

    Each view starts with one real feature plus one column per entity join
    key. After ``update_feature_views_with_inferred_features`` runs, only the
    real feature may remain in both ``schema`` and ``features``.
    """
    source = FileSource(name="test", path="test path")
    first_entity = Entity(name="test1", join_keys=["test_column_1"])
    second_entity = Entity(name="test2", join_keys=["test_column_2"])

    single_entity_view = FeatureView(
        name="test1",
        entities=[first_entity],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
        ],
        source=source,
    )
    two_entity_view = FeatureView(
        name="test2",
        entities=[first_entity, second_entity],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
            Field(name="test_column_2", dtype=String),
        ],
        source=source,
    )

    def local_config():
        # A fresh config is built for every call, as the test always did.
        return RepoConfig(provider="local", project="test")

    # One entity: two columns before, one feature after.
    assert len(single_entity_view.schema) == 2
    assert len(single_entity_view.features) == 2

    update_feature_views_with_inferred_features(
        [single_entity_view],
        [first_entity],
        local_config(),
    )
    assert len(single_entity_view.schema) == 1
    assert len(single_entity_view.features) == 1

    # Two entities: three columns before, one feature after.
    assert len(two_entity_view.schema) == 3
    assert len(two_entity_view.features) == 3

    update_feature_views_with_inferred_features(
        [two_entity_view],
        [first_entity, second_entity],
        local_config(),
    )
    assert len(two_entity_view.schema) == 1
    assert len(two_entity_view.features) == 1
Example #3
0
def test_apply_feature_view_success(test_feature_store):
    """Apply one feature view and check that listing returns it unchanged.

    The checks run in the same order as before (count, name, each feature's
    name and dtype, then the entity), but as separate assertions so a failure
    points at the exact mismatch.
    """
    parquet_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    view = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=parquet_source,
        ttl=timedelta(minutes=5),
    )

    # Register the view, then read the registry back.
    test_feature_store.apply([view])
    listed_views = test_feature_store.list_feature_views()

    assert len(listed_views) == 1
    listed = listed_views[0]
    assert listed.name == "my_feature_view_1"

    expected_fields = (
        ("fs1_my_feature_1", Int64),
        ("fs1_my_feature_2", String),
        ("fs1_my_feature_3", Array(String)),
        ("fs1_my_feature_4", Array(Bytes)),
    )
    for position, (field_name, field_dtype) in enumerate(expected_fields):
        assert listed.features[position].name == field_name
        assert listed.features[position].dtype == field_dtype

    assert listed.entities[0] == "fs1_my_entity_1"

    test_feature_store.teardown()
Example #4
0
    def __init__(
        self,
        *args,
        name: Optional[str] = None,
        schema: Optional[Union[Dict[str, ValueType], List[Field]]] = None,
        description: Optional[str] = "",
        tags: Optional[Dict[str, str]] = None,
        owner: Optional[str] = "",
    ):
        """
        Creates a RequestSource object.

        Args:
            *args: Deprecated positional form. At most two values are accepted,
                interpreted as ``name`` and ``schema`` in that order; when
                given they take precedence over the keyword arguments.
            name: Name of the request source, forwarded to the base class.
            schema: Either a list of ``Field`` objects or the deprecated
                ``Dict[str, ValueType]`` form, which is converted to a list of
                ``Field`` objects.
            description (optional): Forwarded to the base class.
            tags (optional): Forwarded to the base class.
            owner (optional): Forwarded to the base class.

        Raises:
            ValueError: If more than two positional arguments are passed, or if
                no schema (or an empty one) is provided.
            TypeError: If the schema is neither a dictionary nor a list.
        """
        positional_attributes = ["name", "schema"]
        _name = name
        _schema = schema
        if args:
            warnings.warn(
                ("Request source parameters should be specified as a keyword argument instead of a positional arg. "
                 "Feast 0.23+ will not support positional arguments to construct request sources"
                 ),
                DeprecationWarning,
            )
            if len(args) > len(positional_attributes):
                raise ValueError(
                    f"Only {', '.join(positional_attributes)} are allowed as positional args when defining "
                    f"request sources, for backwards compatibility.")
            # Positional values override the keyword ones.
            if len(args) >= 1:
                _name = args[0]
            if len(args) >= 2:
                _schema = args[1]

        super().__init__(name=_name,
                         description=description,
                         tags=tags,
                         owner=owner)

        # An empty dict or list is rejected just like a missing schema.
        if not _schema:
            raise ValueError("Schema needs to be provided for Request Source")

        if isinstance(_schema, dict):
            warnings.warn(
                "Schema in RequestSource is changing type. The schema data type Dict[str, ValueType] is being deprecated in Feast 0.23. "
                "Please use List[Field] instead for the schema",
                DeprecationWarning,
            )
            # Normalise the legacy mapping so self.schema is always a list.
            self.schema = [
                Field(name=key, dtype=VALUE_TYPES_TO_FEAST_TYPES[value_type])
                for key, value_type in _schema.items()
            ]
        elif isinstance(_schema, list):
            self.schema = _schema
        else:
            # TypeError subclasses Exception, so callers that caught the
            # previous bare Exception keep working.
            raise TypeError(
                "Schema type must be either dictionary or list, not " +
                str(type(_schema)))
Example #5
0
def test_hash():
    """Feature services hash by content, so sets collapse only true duplicates.

    Four services share a name: two select both features, one selects a
    single feature, and one selects a single feature and adds a description.
    """
    source = FileSource(name="my-file-source", path="test.parquet")
    view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=source,
    )

    def make_service(feature_names, **extra):
        # Every service uses the same name and a single projection of `view`.
        return FeatureService(
            name="my-feature-service",
            features=[view[feature_names]],
            **extra,
        )

    both_features_a = make_service(["feature1", "feature2"])
    both_features_b = make_service(["feature1", "feature2"])
    one_feature = make_service(["feature1"])
    one_feature_described = make_service(["feature1"], description="test")

    # Identical definitions collapse into one element.
    assert len({both_features_a, both_features_b}) == 1

    # A different feature selection is a different service.
    assert len({both_features_a, one_feature}) == 2

    # A different description is a different service.
    assert len({one_feature, one_feature_described}) == 2

    everything = {
        both_features_a,
        both_features_b,
        one_feature,
        one_feature_described,
    }
    assert len(everything) == 3
Example #6
0
    def infer_features(self):
        """
        Infers the features of this view by running its UDF on an empty frame.

        An empty DataFrame is built with one typed column per input: every
        feature of every source projection is added twice (as
        ``<projection>__<feature>`` and as ``<feature>``) and every request
        source field is added once. The columns and dtypes of the UDF's
        output become the inferred features.

        If features were already specified they are kept, after checking that
        each one appears among the inferred features; otherwise the inferred
        features are stored on the view.

        Raises:
            SpecifiedFeaturesNotPresentError: A specified feature is missing
                from the inferred ones.
            RegistryInferenceFailure: The set of features could not be inferred.
        """
        sample_df = pd.DataFrame()

        for projection in self.source_feature_view_projections.values():
            for feature in projection.features:
                pandas_dtype = feast_value_type_to_pandas_type(
                    feature.dtype.to_value_type())
                # Expose each input under both its prefixed and its bare name.
                column_names = (
                    f"{projection.name}__{feature.name}",
                    f"{feature.name}",
                )
                for column_name in column_names:
                    sample_df[column_name] = pd.Series(dtype=pandas_dtype)

        for request_source in self.source_request_sources.values():
            for field in request_source.schema:
                pandas_dtype = feast_value_type_to_pandas_type(
                    field.dtype.to_value_type())
                sample_df[f"{field.name}"] = pd.Series(dtype=pandas_dtype)

        output_df: pd.DataFrame = self.udf.__call__(sample_df)

        inferred_features = [
            Field(
                name=column,
                dtype=from_value_type(
                    python_type_to_feast_value_type(
                        column, type_name=str(column_dtype))),
            )
            for column, column_dtype in zip(output_df.columns,
                                            output_df.dtypes)
        ]

        if not self.features:
            self.features = inferred_features
        else:
            absent = [
                specified for specified in self.features
                if specified not in inferred_features
            ]
            if absent:
                raise SpecifiedFeaturesNotPresentError(
                    [feature.name for feature in absent], self.name)

        if not self.features:
            raise RegistryInferenceFailure(
                "OnDemandFeatureView",
                f"Could not infer Features for the feature view '{self.name}'.",
            )
Example #7
0
    def __init__(
        self,
        name: str,
        request_data_source: RequestSource,
        description: str = "",
        tags: Optional[Dict[str, str]] = None,
        owner: str = "",
    ):
        """
        Creates a RequestFeatureView object.

        Always emits a DeprecationWarning. The view's features are taken from
        the schema of ``request_data_source``.

        Args:
            name: The unique name of the request feature view.
            request_data_source: The request data source that specifies the schema and
                features of the request feature view.
            description (optional): A human-readable description.
            tags (optional): A dictionary of key-value pairs to store arbitrary metadata.
            owner (optional): The owner of the request feature view, typically the email
                of the primary maintainer.
        """
        warnings.warn(
            "Request feature view is deprecated. "
            "Please use request data source instead",
            DeprecationWarning,
        )

        source_schema = request_data_source.schema
        if not isinstance(source_schema, Dict):
            # Already a sequence of fields: use it as-is.
            new_features = source_schema
        else:
            # NOTE(review): dict values are handed to Field unchanged; confirm
            # they are already types that Field accepts as dtype.
            new_features = [
                Field(name=field_name, dtype=field_dtype)
                for field_name, field_dtype in source_schema.items()
            ]

        super().__init__(
            name=name,
            features=new_features,
            description=description,
            tags=tags,
            owner=owner,
        )
        self.request_source = request_data_source
Example #8
0
    def to_proto(self) -> DataSourceProto:
        """
        Converts this request source into its DataSourceProto representation.

        The schema is serialised field by field into
        ``request_data_options.schema``. A dictionary schema
        (name -> ValueType) is first converted to ``Field`` objects.

        Returns:
            A DataSourceProto of type REQUEST_SOURCE carrying the name,
            description, tags, owner and schema of this source.
        """
        if isinstance(self.schema, dict):
            # Index the lookup table with the ValueType member itself, as the
            # constructor does; indexing with `.value` used a different key.
            fields = [
                Field(name=key, dtype=VALUE_TYPES_TO_FEAST_TYPES[value_type])
                for key, value_type in self.schema.items()
            ]
        else:
            fields = self.schema

        schema_pb = [field.to_proto() for field in fields]

        data_source_proto = DataSourceProto(
            name=self.name,
            type=DataSourceProto.REQUEST_SOURCE,
            description=self.description,
            tags=self.tags,
            owner=self.owner,
        )
        data_source_proto.request_data_options.schema.extend(schema_pb)

        return data_source_proto
Example #9
0
def update_feature_views_with_inferred_features(fvs: List[FeatureView],
                                                entities: List[Entity],
                                                config: RepoConfig) -> None:
    """
    Infers the set of features associated to each FeatureView and updates the FeatureView with those features.
    Inference occurs through considering each column of the underlying data source as a feature except columns that are
    associated with the data source's timestamp columns and the FeatureView's entity columns.

    Each feature view is modified in place. Fields whose name is an entity join key are always removed from
    both ``schema`` and ``features``. Inference from the batch source only runs when no features remain after
    that removal.

    Args:
        fvs: The feature views to be updated.
        entities: A list containing entities associated with the feature views.
        config: The config for the current feature store.

    Raises:
        RegistryInferenceFailure: Inference ran for a feature view but produced no features.
    """
    entity_name_to_join_key_map = {
        entity.name: entity.join_key
        for entity in entities
    }
    # A set makes the membership tests below O(1) instead of scanning a view.
    join_keys = set(entity_name_to_join_key_map.values())

    for fv in fvs:
        # First drop all Entity fields. Then infer features if necessary.
        fv.schema = [
            field for field in fv.schema if field.name not in join_keys
        ]
        fv.features = [
            field for field in fv.features if field.name not in join_keys
        ]

        if fv.features:
            # Features were declared explicitly; nothing to infer.
            continue

        batch_source = fv.batch_source
        field_mapping = batch_source.field_mapping
        timestamp_columns = (
            batch_source.timestamp_field,
            batch_source.created_timestamp_column,
        )

        columns_to_exclude = set(timestamp_columns) | {
            entity_name_to_join_key_map[entity_name]
            for entity_name in fv.entities
        }
        # Also exclude the mapped names of the timestamp columns.
        for timestamp_column in timestamp_columns:
            if timestamp_column in field_mapping:
                columns_to_exclude.add(field_mapping[timestamp_column])

        for col_name, col_datatype in batch_source.get_table_column_names_and_types(
                config):
            if col_name in columns_to_exclude:
                continue
            # Double underscores often signal an internal-use column. The
            # former `re.match("^__|__$", ...)` only matched at the start of
            # the name, so a trailing "__" was never detected; check both ends.
            if col_name.startswith("__") or col_name.endswith("__"):
                continue

            field = Field(
                name=field_mapping.get(col_name, col_name),
                dtype=from_value_type(
                    batch_source.source_datatype_to_feast_value_type()(
                        col_datatype)),
            )
            # Note that schema and features are two different attributes of a
            # FeatureView, and that features should be present in both.
            fv.schema.append(field)
            fv.features.append(field)

        if not fv.features:
            raise RegistryInferenceFailure(
                "FeatureView",
                f"Could not infer Features for the FeatureView named {fv.name}.",
            )
Example #10
0
def test_default_data_source_kw_arg_warning():
    """Check the deprecated positional-argument constructors of data sources.

    For Kafka, Kinesis, Request and Push sources the test expects that a
    fully positional construction emits a DeprecationWarning and still
    populates the attributes, and that an incomplete construction raises
    ValueError. Finally, a KafkaSource built without a name is expected to
    emit a UserWarning.
    """
    # Kafka
    with pytest.warns(DeprecationWarning):
        kafka_source = KafkaSource(
            "name", "column", "bootstrap_servers", ProtoFormat("class_path"), "topic"
        )
        assert kafka_source.name == "name"
        assert kafka_source.timestamp_field == "column"
        assert kafka_source.kafka_options.bootstrap_servers == "bootstrap_servers"
        assert kafka_source.kafka_options.topic == "topic"
    with pytest.raises(ValueError):
        KafkaSource("name", "column", "bootstrap_servers", topic="topic")

    # Kinesis
    with pytest.warns(DeprecationWarning):
        kinesis_source = KinesisSource(
            "name",
            "column",
            "c_column",
            ProtoFormat("class_path"),
            "region",
            "stream_name",
        )
        assert kinesis_source.name == "name"
        assert kinesis_source.timestamp_field == "column"
        assert kinesis_source.created_timestamp_column == "c_column"
        assert kinesis_source.kinesis_options.region == "region"
        assert kinesis_source.kinesis_options.stream_name == "stream_name"

    with pytest.raises(ValueError):
        KinesisSource(
            "name", "column", "c_column", region="region", stream_name="stream_name"
        )

    # Request
    with pytest.warns(DeprecationWarning):
        request_source = RequestSource(
            "name", [Field(name="val_to_add", dtype=Int64)], description="description"
        )
        assert request_source.name == "name"
        assert request_source.description == "description"

    with pytest.raises(ValueError):
        RequestSource("name")

    # Push
    with pytest.warns(DeprecationWarning):
        push_source = PushSource(
            "name",
            BigQuerySource(name="bigquery_source", table="table"),
            description="description",
        )
        assert push_source.name == "name"
        assert push_source.description == "description"
        assert push_source.batch_source.name == "bigquery_source"

    with pytest.raises(ValueError):
        PushSource("name")

    # No name warning for DataSource
    with pytest.warns(UserWarning):
        KafkaSource(
            event_timestamp_column="column",
            bootstrap_servers="bootstrap_servers",
            message_format=ProtoFormat("class_path"),
            topic="topic",
        )
Example #11
0
def test_hash():
    """On-demand feature views hash by content, so sets collapse only duplicates.

    Four views share name, sources and schema: two use ``udf1``, one uses
    ``udf2``, and one uses ``udf2`` and adds a description.
    """
    source = FileSource(name="my-file-source", path="test.parquet")
    base_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=source,
    )
    # The same list object is handed to every on-demand view.
    sources = [base_view]

    def make_odfv(udf, **extra):
        # Only the UDF (and optional extras) vary between the views.
        return OnDemandFeatureView(
            name="my-on-demand-feature-view",
            sources=sources,
            schema=[
                Field(name="output1", dtype=Float32),
                Field(name="output2", dtype=Float32),
            ],
            udf=udf,
            **extra,
        )

    udf1_view_a = make_odfv(udf1)
    udf1_view_b = make_odfv(udf1)
    udf2_view = make_odfv(udf2)
    udf2_view_described = make_odfv(udf2, description="test")

    # Identical definitions collapse into one element.
    assert len({udf1_view_a, udf1_view_b}) == 1

    # A different UDF is a different view.
    assert len({udf1_view_a, udf2_view}) == 2

    # A different description is a different view.
    assert len({udf2_view, udf2_view_described}) == 2

    everything = {
        udf1_view_a,
        udf1_view_b,
        udf2_view,
        udf2_view_described,
    }
    assert len(everything) == 3
Example #12
0
def test_inputs_parameter_deprecation_in_odfv():
    """Exercise the deprecated ``inputs`` parameter of ``on_demand_feature_view``.

    Three cases are covered:
      1. ``inputs`` alone is expected to emit a DeprecationWarning and still
         produce a usable view.
      2. ``inputs`` together with ``sources`` is expected to raise ValueError.
      3. ``inputs`` keyed by an arbitrary name ("odfv"): the assertions expect
         the request source to be reachable under its own name
         ("date_request") rather than under the ``inputs`` key.
    """
    date_request = RequestSource(
        name="date_request",
        schema=[Field(name="some_date", dtype=UnixTimestamp)],
    )
    # Case 1: the decorator itself must trigger the deprecation warning.
    with pytest.warns(DeprecationWarning):

        @on_demand_feature_view(
            inputs={"date_request": date_request},
            schema=[
                Field(name="output", dtype=UnixTimestamp),
                Field(name="string_output", dtype=String),
            ],
        )
        def test_view(features_df: pd.DataFrame) -> pd.DataFrame:
            data = pd.DataFrame()
            data["output"] = features_df["some_date"]
            data["string_output"] = features_df["some_date"].astype(
                pd.StringDtype())
            return data

    # The decorated name is bound to the resulting view object, whose name is
    # expected to equal the function name.
    odfv = test_view
    assert odfv.name == "test_view"
    assert len(odfv.source_request_sources) == 1
    assert odfv.source_request_sources["date_request"].name == "date_request"
    assert odfv.source_request_sources[
        "date_request"].schema == date_request.schema

    # Case 2: passing both `inputs` and `sources` must be rejected.
    with pytest.raises(ValueError):

        @on_demand_feature_view(
            inputs={"date_request": date_request},
            sources=[date_request],
            schema=[
                Field(name="output", dtype=UnixTimestamp),
                Field(name="string_output", dtype=String),
            ],
        )
        def incorrect_testview(features_df: pd.DataFrame) -> pd.DataFrame:
            data = pd.DataFrame()
            data["output"] = features_df["some_date"]
            data["string_output"] = features_df["some_date"].astype(
                pd.StringDtype())
            return data

    # Case 3: the `inputs` key ("odfv") differs from the request source name.
    @on_demand_feature_view(
        inputs={"odfv": date_request},
        schema=[
            Field(name="output", dtype=UnixTimestamp),
            Field(name="string_output", dtype=String),
        ],
    )
    def test_correct_view(features_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["output"] = features_df["some_date"]
        data["string_output"] = features_df["some_date"].astype(
            pd.StringDtype())
        return data

    odfv = test_correct_view
    assert odfv.name == "test_correct_view"
    # Lookup uses the request source's own name, not the "odfv" key above.
    assert odfv.source_request_sources[
        "date_request"].schema == date_request.schema
Example #13
0
def test_modify_feature_views_success(test_registry, request_source_schema):
    """Re-apply a modified on-demand feature view and check the registry state.

    An on-demand feature view and a regular feature view are applied, then the
    on-demand view is redefined with a different dtype for its first feature
    and applied again. The test expects the registry to hold exactly one
    on-demand view with the new dtype, the regular view to be unchanged, and
    the registry file to be gone after teardown.

    Args:
        test_registry: Registry under test.
        request_source_schema: Schema passed to the RequestSource that feeds
            the on-demand feature view.
    """
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    request_source = RequestSource(
        name="request_source",
        schema=request_source_schema,
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[Field(name="fs1_my_feature_1", dtype=Int64)],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Initial definition: the first output feature is declared as STRING.
    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype(
            "category")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    project = "project"

    # Register Feature Views
    test_registry.apply_feature_view(odfv1, project)
    test_registry.apply_feature_view(fv1, project)

    # Modify odfv by changing a single feature dtype
    # (this deliberately rebinds the name `odfv1` to the new definition).
    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("float")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    # Apply the modified odfv
    test_registry.apply_feature_view(odfv1, project)

    # Check odfv
    on_demand_feature_views = test_registry.list_on_demand_feature_views(
        project)

    # Still a single view: the second apply replaced the first definition,
    # and the first feature now reports Float32.
    assert (
        len(on_demand_feature_views) == 1
        and on_demand_feature_views[0].name == "odfv1"
        and on_demand_feature_views[0].features[0].name == "odfv1_my_feature_1"
        and on_demand_feature_views[0].features[0].dtype == Float32
        and on_demand_feature_views[0].features[1].name == "odfv1_my_feature_2"
        and on_demand_feature_views[0].features[1].dtype == Int32)
    # The request data schema is expected as a name -> ValueType mapping.
    request_schema = on_demand_feature_views[0].get_request_data_schema()
    assert (list(request_schema.keys())[0] == "my_input_1"
            and list(request_schema.values())[0] == ValueType.INT32)

    # Same checks through the single-object getter.
    feature_view = test_registry.get_on_demand_feature_view("odfv1", project)
    assert (feature_view.name == "odfv1"
            and feature_view.features[0].name == "odfv1_my_feature_1"
            and feature_view.features[0].dtype == Float32
            and feature_view.features[1].name == "odfv1_my_feature_2"
            and feature_view.features[1].dtype == Int32)
    request_schema = feature_view.get_request_data_schema()
    assert (list(request_schema.keys())[0] == "my_input_1"
            and list(request_schema.values())[0] == ValueType.INT32)

    # Make sure fv1 is untouched
    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example #14
0
    )
    def test_view_with_missing_feature(
            features_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["output"] = features_df["some_date"]
        return data

    with pytest.raises(SpecifiedFeaturesNotPresentError):
        test_view_with_missing_feature.infer_features()


# TODO(kevjumba): drop the Dict[str, ValueType] schema case in Feast 0.23, when dict schemas for RequestSource are removed.
@pytest.mark.parametrize(
    "request_source_schema",
    [
        [Field(name="some_date", dtype=UnixTimestamp)],
        {
            "some_date": ValueType.UNIX_TIMESTAMP
        },
    ],
)
def test_datasource_inference(request_source_schema):
    # Create Feature Views
    date_request = RequestSource(
        name="date_request",
        schema=request_source_schema,
    )

    @on_demand_feature_view(
        # Note: we deliberately use positional arguments here to test that they work correctly,
        # even though positional arguments are deprecated in favor of keyword arguments.
Example #15
0
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    # Refers to the entity defined above by its name.
    entities=["driver_id"],
    # 86400 seconds per day * 7 = a TTL of seven days.
    ttl=Duration(seconds=86400 * 7),
    schema=[
        Field(name="conv_rate", dtype=Float64),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

input_request = RequestSource(
    name="vals_to_add",
    schema=[
        Field(name="val_to_add", dtype=Int64),
        Field(name="val_to_add_2", dtype=Int64),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
Example #16
0
def test_historical_features_from_bigquery_sources_containing_backfills(
        environment):
    """Historical retrieval must pick the expected row when rows are duplicated.

    Two scenarios are loaded into the source:
      * driver 1001 has two rows with the same event timestamp but different
        created timestamps; the value 20 (later created) is expected.
      * driver 1002 has a row with a later event timestamp but an earlier
        created timestamp; the value 40 (later event) is expected.
    """
    store = environment.feature_store

    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)
    day_after_tomorrow = now + timedelta(days=2)

    def entity_row(driver_id):
        return {
            "driver_id": driver_id,
            "event_timestamp": day_after_tomorrow,
        }

    def stats_row(driver_id, trips, event_ts, created_ts):
        return {
            "driver_id": driver_id,
            "avg_daily_trips": trips,
            "event_timestamp": event_ts,
            "created": created_ts,
        }

    def expected_row(driver_id, trips):
        return {
            "driver_id": driver_id,
            "event_timestamp": day_after_tomorrow,
            "avg_daily_trips": trips,
        }

    entity_df = pd.DataFrame(data=[entity_row(1001), entity_row(1002)])

    driver_stats_df = pd.DataFrame(data=[
        # Duplicated rows simple case
        stats_row(1001, 10, now, now),
        stats_row(1001, 20, now, tomorrow),
        # Duplicated rows after a backfill
        stats_row(1002, 30, now, tomorrow),
        stats_row(1002, 40, tomorrow, now),
    ])

    expected_df = pd.DataFrame(
        data=[expected_row(1001, 20), expected_row(1002, 40)])

    # Timestamp plus random suffix keeps the destination name unique per run.
    destination = (
        f"test_driver_stats_{time.time_ns()}_{random.randint(1000, 9999)}")
    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=destination,
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver = Entity(name="driver",
                    join_keys=["driver_id"],
                    value_type=ValueType.INT64)
    driver_fv = FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        batch_source=driver_stats_data_source,
        ttl=None,
    )

    store.apply([driver, driver_fv])

    offline_job = store.get_historical_features(
        entity_df=entity_df,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
    )

    started_at = datetime.utcnow()
    actual_df = offline_job.to_df()

    print(f"actual_df shape: {actual_df.shape}")
    finished_at = datetime.utcnow()
    elapsed = finished_at - started_at
    print(f"Time to execute job_from_df.to_df() = '{elapsed}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df.columns)
    assert_frame_equal(expected_df, actual_df, keys=["driver_id"])
Example #17
0
    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()


@pytest.mark.parametrize(
    "test_registry",
    [lazy_fixture("local_registry")],
)
# TODO(kevjumba): remove this in feast 0.23 when deprecating
@pytest.mark.parametrize(
    "request_source_schema",
    [[Field(name="my_input_1", dtype=Int32)], {
        "my_input_1": ValueType.INT32
    }],
)
def test_modify_feature_views_success(test_registry, request_source_schema):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    request_source = RequestSource(
        name="request_source",
        schema=request_source_schema,
Example #18
0
    def from_proto(cls,
                   on_demand_feature_view_proto: OnDemandFeatureViewProto):
        """
        Builds an on demand feature view from its protobuf message.

        Args:
            on_demand_feature_view_proto: The protobuf message to convert. Its
                ``spec`` supplies the definition and its ``meta`` the timestamps.

        Returns:
            The OnDemandFeatureView reconstructed from the protobuf message.
        """
        spec = on_demand_feature_view_proto.spec
        meta = on_demand_feature_view_proto.meta

        # Convert every entry of the sources map according to which member of
        # the "source" oneof is populated; the map keys themselves are unused.
        sources = []
        for _key, source_proto in spec.sources.items():
            source_kind = source_proto.WhichOneof("source")
            if source_kind == "feature_view":
                # A full feature view is reduced to its projection.
                converted = FeatureView.from_proto(
                    source_proto.feature_view).projection
            elif source_kind == "feature_view_projection":
                converted = FeatureViewProjection.from_proto(
                    source_proto.feature_view_projection)
            else:
                # Every other case is read as a request data source.
                converted = RequestSource.from_proto(
                    source_proto.request_data_source)
            sources.append(converted)

        # One Field per feature in the spec, with the stored value type
        # translated through from_value_type.
        schema = [
            Field(
                name=feature.name,
                dtype=from_value_type(ValueType(feature.value_type)),
            ) for feature in spec.features
        ]

        # NOTE(review): dill.loads deserializes the stored UDF body and, like
        # pickle, can execute arbitrary code -- the proto must come from a
        # trusted registry.
        udf = dill.loads(spec.user_defined_function.body)

        on_demand_feature_view_obj = cls(
            name=spec.name,
            schema=schema,
            sources=sources,
            udf=udf,
            description=spec.description,
            tags=dict(spec.tags),
            owner=spec.owner,
        )

        # FeatureViewProjections are not saved in the OnDemandFeatureView proto,
        # so the default projection is rebuilt from the new object.
        on_demand_feature_view_obj.projection = FeatureViewProjection.from_definition(
            on_demand_feature_view_obj)

        # Copy over only the timestamps that are actually set on the meta.
        for timestamp_name in ("created_timestamp", "last_updated_timestamp"):
            if meta.HasField(timestamp_name):
                setattr(
                    on_demand_feature_view_obj,
                    timestamp_name,
                    getattr(meta, timestamp_name).ToDatetime(),
                )

        return on_demand_feature_view_obj
Example #19
0
def test_apply_feature_view_integration(test_registry):
    """Apply, list, fetch and delete a feature view through the registry.

    Every expectation is asserted on its own line so that a failure reports
    the exact attribute that differs, instead of one opaque ``and`` chain
    evaluating to False.

    Args:
        test_registry: Registry fixture under test; it is torn down by this
            test, after which reloading it is expected to fail.
    """
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # (name, dtype) expected at features[0..3] of the stored feature view.
    expected_features = [
        ("fs1_my_feature_1", Int64),
        ("fs1_my_feature_2", String),
        ("fs1_my_feature_3", Array(String)),
        ("fs1_my_feature_4", Array(Bytes)),
    ]

    def assert_matches_fv1(feature_view):
        # Shared by the list and get paths, which must return the same data.
        assert feature_view.name == "my_feature_view_1"
        # Indexing (not zip) keeps a too-short feature list a hard failure.
        for position, (name, dtype) in enumerate(expected_features):
            assert feature_view.features[position].name == name
            assert feature_view.features[position].dtype == dtype
        assert feature_view.entities[0] == "fs1_my_entity_1"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    # List Feature Views
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 1
    assert_matches_fv1(feature_views[0])

    # Fetch the same feature view by name
    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert_matches_fv1(feature_view)

    # Delete it and confirm the project is empty again
    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example #20
0
# File-based batch source read from a parquet file; "event_timestamp" is the
# timestamp field and "created" the created-timestamp column.
driver_hourly_stats = FileSource(
    path="data/driver_stats_with_string.parquet",
    created_timestamp_column="created",
    timestamp_field="event_timestamp",
)

# Entity registered under the name "driver_id" with INT64 values.
driver = Entity(name="driver_id",
                value_type=ValueType.INT64,
                description="driver id")

# Batch feature view over the source above, keyed by the "driver_id" entity.
driver_hourly_stats_view = BatchFeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
        Field(name="string_feature", dtype=String),
    ],
    batch_source=driver_hourly_stats,
    ttl=Duration(seconds=86400000),  # 86,400,000 s == 1000 days
    online=True,
    tags={},
)

# Define a request data source which encodes features / information only
# available at request time (e.g. part of the user initiated HTTP request)
input_request = RequestSource(
    name="vals_to_add",
    schema=[
        Field(name="val_to_add", dtype=Int64),