Code Example #1
def test_apply_feature_view_success(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == String
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == Array(String)
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == Array(Bytes)
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    test_feature_store.teardown()
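
For reference, the newer-style examples here (schema= with Field and feast.types) assume imports roughly like the sketch below; the exact module layout depends on the Feast version, so treat this as a sketch rather than a definitive listing.

from datetime import timedelta

from feast import FeatureView, Field, FileSource
from feast.data_format import ParquetFormat
from feast.types import Array, Bytes, Int64, String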
Code Example #2
def test_apply_feature_view_success(test_feature_store):
    # Create Feature Views
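    # NOTE: unlike Code Example #1, this variant targets the older Feast API
    # (Feature/ValueType, file_url=, event_timestamp_column=, input=) rather
    # than the newer schema=/Field API.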
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )
Code Example #3
    def get_feature_view(self, name: str, project: str) -> FeatureView:
        """
        Retrieves a feature view.

        Args:
            name: Name of feature view
            project: Feast project that this feature view belongs to

        Returns:
            The specified feature view, or raises an exception if
            none is found
        """
        registry_proto = self._get_registry_proto()
        for feature_view_proto in registry_proto.feature_views:
            if (feature_view_proto.spec.name == name
                    and feature_view_proto.spec.project == project):
                return FeatureView.from_proto(feature_view_proto)
        raise Exception(
            f"Feature view {name} does not exist in project {project}")
Code Example #4
File: registry.py Project: terryyylim/feast
    def list_feature_views(
        self, project: str, allow_cache: bool = False
    ) -> List[FeatureView]:
        """
        Retrieve a list of feature views from the registry

        Args:
            allow_cache: Allow returning feature views from the cached registry
            project: Filter feature tables based on project name

        Returns:
            List of feature views
        """
        registry_proto = self._get_registry_proto(allow_cache=allow_cache)
        feature_views = []
        for feature_view_proto in registry_proto.feature_views:
            if feature_view_proto.spec.project == project:
                feature_views.append(FeatureView.from_proto(feature_view_proto))
        return feature_views
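
A minimal usage sketch for the two registry accessors above (assuming `registry` is an instance of this registry class; the names are illustrative):

fv = registry.get_feature_view("my_feature_view_1", project="project")
cached_views = registry.list_feature_views(project="project", allow_cache=True)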
Code Example #5
    def from_proto(cls,
                   on_demand_feature_view_proto: OnDemandFeatureViewProto):
        """
        Creates an on demand feature view from a protobuf representation.

        Args:
            on_demand_feature_view_proto: A protobuf representation of an on-demand feature view.

        Returns:
            A OnDemandFeatureView object based on the on-demand feature view protobuf.
        """
        inputs = {}
        for (
                input_name,
                on_demand_input,
        ) in on_demand_feature_view_proto.spec.inputs.items():
            if on_demand_input.WhichOneof("input") == "feature_view":
                inputs[input_name] = FeatureView.from_proto(
                    on_demand_input.feature_view)
            else:
                inputs[input_name] = RequestDataSource.from_proto(
                    on_demand_input.request_data_source)
        on_demand_feature_view_obj = cls(
            name=on_demand_feature_view_proto.spec.name,
            features=[
                Feature(
                    name=feature.name,
                    dtype=ValueType(feature.value_type),
                    labels=dict(feature.labels),
                ) for feature in on_demand_feature_view_proto.spec.features
            ],
            inputs=inputs,
            udf=dill.loads(
                on_demand_feature_view_proto.spec.user_defined_function.body),
        )

        # FeatureViewProjections are not saved in the OnDemandFeatureView proto.
        # Create the default projection.
        on_demand_feature_view_obj.projection = FeatureViewProjection.from_definition(
            on_demand_feature_view_obj)

        return on_demand_feature_view_obj
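
Since `from_proto` is the inverse of serialization, a round-trip looks like the following sketch (assuming `odfv` is an existing OnDemandFeatureView; `to_proto` is used the same way elsewhere in these examples):

odfv_proto = odfv.to_proto()
restored = OnDemandFeatureView.from_proto(odfv_proto)
assert restored.name == odfv.name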
Code Example #6
def updater(registry_proto: RegistryProto):
    for idx, existing_feature_view_proto in enumerate(
        registry_proto.feature_views
    ):
        if (
            existing_feature_view_proto.spec.name == feature_view.name
            and existing_feature_view_proto.spec.project == project
        ):
            existing_feature_view = FeatureView.from_proto(
                existing_feature_view_proto
            )
            existing_feature_view.materialization_intervals.append(
                (start_date, end_date)
            )
            feature_view_proto = existing_feature_view.to_proto()
            feature_view_proto.spec.project = project
            del registry_proto.feature_views[idx]
            registry_proto.feature_views.append(feature_view_proto)
            return registry_proto
    raise FeatureViewNotFoundException(feature_view.name, project)
Code Example #7
    def apply_materialization(
        self,
        feature_view: FeatureView,
        project: str,
        start_date: datetime,
        end_date: datetime,
        commit: bool = True,
    ):
        """
        Updates materialization intervals tracked for a single feature view in Feast

        Args:
            feature_view: Feature view that will be updated with an additional materialization interval tracked
            project: Feast project that this feature view belongs to
            start_date: Start date of the materialization interval to track
            end_date: End date of the materialization interval to track
            commit: Whether the change should be persisted immediately
        """
        self._prepare_registry_for_changes()
        assert self.cached_registry_proto

        for idx, existing_feature_view_proto in enumerate(
                self.cached_registry_proto.feature_views):
            if (existing_feature_view_proto.spec.name == feature_view.name
                    and existing_feature_view_proto.spec.project == project):
                existing_feature_view = FeatureView.from_proto(
                    existing_feature_view_proto)
                existing_feature_view.materialization_intervals.append(
                    (start_date, end_date))
                existing_feature_view.last_updated_timestamp = datetime.utcnow()
                feature_view_proto = existing_feature_view.to_proto()
                feature_view_proto.spec.project = project
                del self.cached_registry_proto.feature_views[idx]
                self.cached_registry_proto.feature_views.append(
                    feature_view_proto)
                if commit:
                    self.commit()
                return

        raise FeatureViewNotFoundException(feature_view.name, project)
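
A hypothetical call matching the signature above, recording one materialized interval (the names are illustrative):

from datetime import datetime

registry.apply_materialization(
    feature_view=driver_fv,
    project="project",
    start_date=datetime(2022, 1, 1),
    end_date=datetime(2022, 1, 2),
)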
Code Example #8
def updater(registry_proto: RegistryProto):
    for idx, existing_feature_view_proto in enumerate(
        registry_proto.feature_views
    ):
        if (
            existing_feature_view_proto.spec.name == feature_view_proto.spec.name
            and existing_feature_view_proto.spec.project == project
        ):
            # do not update if feature view has not changed; updating will erase tracked materialization intervals
            if FeatureView.from_proto(existing_feature_view_proto) == feature_view:
                return registry_proto
            else:
                del registry_proto.feature_views[idx]
                registry_proto.feature_views.append(feature_view_proto)
                return registry_proto
    registry_proto.feature_views.append(feature_view_proto)
    return registry_proto
Code Example #9
    def _localize_feature_view(self, feature_view: FeatureView):
        """
        This function ensures that the `FeatureView` object points to files in the local disk
        """
        if not isinstance(feature_view.batch_source, FileSource):
            return

        # Copy parquet file to a local file
        file_source: FileSource = feature_view.batch_source
        random_local_path = (
            FlyteContext.current_context().file_access.get_random_local_path(
                file_source.path))
        FlyteContext.current_context().file_access.get_data(
            file_source.path,
            random_local_path,
            is_multipart=True,
        )
        feature_view.batch_source = FileSource(
            path=random_local_path,
            event_timestamp_column=file_source.event_timestamp_column,
        )
Code Example #10
File: registry.py Project: Shopify/feast
    def get_feature_view(self,
                         name: str,
                         project: str,
                         allow_cache: bool = False) -> FeatureView:
        """
        Retrieves a feature view.

        Args:
            name: Name of feature view
            project: Feast project that this feature view belongs to
            allow_cache: Allow returning feature view from the cached registry

        Returns:
            The specified feature view, or raises an exception if
            none is found
        """
        registry_proto = self._get_registry_proto(allow_cache=allow_cache)
        for feature_view_proto in registry_proto.feature_views:
            if (feature_view_proto.spec.name == name
                    and feature_view_proto.spec.project == project):
                return FeatureView.from_proto(feature_view_proto)
        raise FeatureViewNotFoundException(name, project)
Code Example #11
def test_hash():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    feature_service_1 = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1", "feature2"]]]
    )
    feature_service_2 = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1", "feature2"]]]
    )
    feature_service_3 = FeatureService(
        name="my-feature-service", features=[feature_view[["feature1"]]]
    )
    feature_service_4 = FeatureService(
        name="my-feature-service",
        features=[feature_view[["feature1"]]],
        description="test",
    )

    s1 = {feature_service_1, feature_service_2}
    assert len(s1) == 1

    s2 = {feature_service_1, feature_service_3}
    assert len(s2) == 2

    s3 = {feature_service_3, feature_service_4}
    assert len(s3) == 2

    s4 = {feature_service_1, feature_service_2, feature_service_3, feature_service_4}
    assert len(s4) == 3
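
The set cardinalities above imply that FeatureService hashing and equality account for the selected features and the description; equivalently, as a sketch of the same fact:

assert feature_service_1 == feature_service_2
assert hash(feature_service_1) == hash(feature_service_2)
assert feature_service_1 != feature_service_3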
Code Example #12
File: test_feature_store.py Project: baineng/feast
def test_feature_view_inference_success(test_feature_store, dataframe_source):
    with prep_file_source(
        df=dataframe_source, event_timestamp_column="ts_1"
    ) as file_source:
        entity = Entity(name="id", join_key="id_join_key", value_type=ValueType.INT64)

        fv1 = FeatureView(
            name="fv1",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=file_source,
            tags={},
        )

        fv2 = FeatureView(
            name="fv2",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_table_ref_arg(dataframe_source, "ts_1"),
            tags={},
        )

        fv3 = FeatureView(
            name="fv3",
            entities=["id"],
            ttl=timedelta(minutes=5),
            online=True,
            batch_source=simple_bq_source_using_query_arg(dataframe_source, "ts_1"),
            tags={},
        )

        test_feature_store.apply([entity, fv1, fv2, fv3])  # Register Feature Views
        feature_view_1 = test_feature_store.list_feature_views()[0]
        feature_view_2 = test_feature_store.list_feature_views()[1]
        feature_view_3 = test_feature_store.list_feature_views()[2]

        actual_file_source = {
            (feature.name, feature.dtype) for feature in feature_view_1.features
        }
        actual_bq_using_table_ref_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_2.features
        }
        actual_bq_using_query_arg_source = {
            (feature.name, feature.dtype) for feature in feature_view_3.features
        }
        expected = {
            ("float_col", ValueType.DOUBLE),
            ("int64_col", ValueType.INT64),
            ("string_col", ValueType.STRING),
        }

        assert (
            expected
            == actual_file_source
            == actual_bq_using_table_ref_arg_source
            == actual_bq_using_query_arg_source
        )

        test_feature_store.teardown()
Code Example #13
def test_modify_feature_views_success(test_registry, request_source_schema):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    request_source = RequestSource(
        name="request_source",
        schema=request_source_schema,
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[Field(name="fs1_my_feature_1", dtype=Int64)],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.STRING),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype(
            "category")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    project = "project"

    # Register Feature Views
    test_registry.apply_feature_view(odfv1, project)
    test_registry.apply_feature_view(fv1, project)

    # Modify odfv by changing a single feature dtype
    @on_demand_feature_view(
        features=[
            Feature(name="odfv1_my_feature_1", dtype=ValueType.FLOAT),
            Feature(name="odfv1_my_feature_2", dtype=ValueType.INT32),
        ],
        sources=[request_source],
    )
    def odfv1(feature_df: pd.DataFrame) -> pd.DataFrame:
        data = pd.DataFrame()
        data["odfv1_my_feature_1"] = feature_df["my_input_1"].astype("float")
        data["odfv1_my_feature_2"] = feature_df["my_input_1"].astype("int32")
        return data

    # Apply the modified odfv
    test_registry.apply_feature_view(odfv1, project)

    # Check odfv
    on_demand_feature_views = test_registry.list_on_demand_feature_views(
        project)

    assert (
        len(on_demand_feature_views) == 1
        and on_demand_feature_views[0].name == "odfv1"
        and on_demand_feature_views[0].features[0].name == "odfv1_my_feature_1"
        and on_demand_feature_views[0].features[0].dtype == Float32
        and on_demand_feature_views[0].features[1].name == "odfv1_my_feature_2"
        and on_demand_feature_views[0].features[1].dtype == Int32)
    request_schema = on_demand_feature_views[0].get_request_data_schema()
    assert (list(request_schema.keys())[0] == "my_input_1"
            and list(request_schema.values())[0] == ValueType.INT32)

    feature_view = test_registry.get_on_demand_feature_view("odfv1", project)
    assert (feature_view.name == "odfv1"
            and feature_view.features[0].name == "odfv1_my_feature_1"
            and feature_view.features[0].dtype == Float32
            and feature_view.features[1].name == "odfv1_my_feature_2"
            and feature_view.features[1].dtype == Int32)
    request_schema = feature_view.get_request_data_schema()
    assert (list(request_schema.keys())[0] == "my_input_1"
            and list(request_schema.values())[0] == ValueType.INT32)

    # Make sure fv1 is untouched
    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Code Example #14
def test_apply_feature_view_success(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == String
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == Array(String)
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == Array(Bytes)
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == String
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == Array(String)
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == Array(Bytes)
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Code Example #15
def test_historical_features_from_bigquery_sources_containing_backfills(
        environment):
    store = environment.feature_store

    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)
    day_after_tomorrow = now + timedelta(days=2)

    entity_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": day_after_tomorrow
        },
        {
            "driver_id": 1002,
            "event_timestamp": day_after_tomorrow
        },
    ])

    driver_stats_df = pd.DataFrame(data=[
        # Duplicated rows simple case
        {
            "driver_id": 1001,
            "avg_daily_trips": 10,
            "event_timestamp": now,
            "created": now,
        },
        {
            "driver_id": 1001,
            "avg_daily_trips": 20,
            "event_timestamp": now,
            "created": tomorrow,
        },
        # Duplicated rows after a backfill
        {
            "driver_id": 1002,
            "avg_daily_trips": 30,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1002,
            "avg_daily_trips": 40,
            "event_timestamp": tomorrow,
            "created": now,
        },
    ])

    expected_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": day_after_tomorrow,
            "avg_daily_trips": 20,
        },
        {
            "driver_id": 1002,
            "event_timestamp": day_after_tomorrow,
            "avg_daily_trips": 40,
        },
    ])

    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=f"test_driver_stats_{int(time.time_ns())}_{random.randint(1000, 9999)}",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver = Entity(name="driver",
                    join_keys=["driver_id"],
                    value_type=ValueType.INT64)
    driver_fv = FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        batch_source=driver_stats_data_source,
        ttl=None,
    )

    store.apply([driver, driver_fv])

    offline_job = store.get_historical_features(
        entity_df=entity_df,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
    )

    start_time = datetime.utcnow()
    actual_df = offline_job.to_df()

    print(f"actual_df shape: {actual_df.shape}")
    end_time = datetime.utcnow()
    print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

    assert sorted(expected_df.columns) == sorted(actual_df.columns)
    assert_frame_equal(expected_df, actual_df, keys=["driver_id"])
Code Example #16
    def test_bigquery_table_to_datastore_correctness(self):
        # create dataset
        now = datetime.utcnow()
        ts = pd.Timestamp(now).round("ms")
        data = {
            "id": [1, 2, 1, 3, 3],
            "value": [0.1, 0.2, 0.3, 4, 5],
            "ts_1": [
                ts - timedelta(seconds=4),
                ts,
                ts - timedelta(seconds=3),
                ts - timedelta(seconds=4),
                ts - timedelta(seconds=1),
            ],
            "created_ts": [ts, ts, ts, ts, ts],
        }
        df = pd.DataFrame.from_dict(data)

        # load dataset into BigQuery
        job_config = bigquery.LoadJobConfig()
        table_id = f"{self.gcp_project}.{self.bigquery_dataset}.table_correctness_{int(time.time())}"
        job = self.client.load_table_from_dataframe(df,
                                                    table_id,
                                                    job_config=job_config)
        job.result()

        # create FeatureView
        fv = FeatureView(
            name="test_bq_table_correctness",
            entities=["driver_id"],
            features=[Feature("value", ValueType.FLOAT)],
            ttl=timedelta(minutes=5),
            input=BigQuerySource(
                event_timestamp_column="ts",
                table_ref=table_id,
                created_timestamp_column="created_ts",
                field_mapping={
                    "ts_1": "ts",
                    "id": "driver_id"
                },
                date_partition_column="",
            ),
        )
        config = RepoConfig(
            metadata_store="./metadata.db",
            project=f"test_bq_table_correctness_{int(time.time())}",
            provider="gcp",
        )
        fs = FeatureStore(config=config)
        fs.apply([fv])

        # run materialize()
        fs.materialize(
            [fv.name],
            now - timedelta(seconds=5),
            now - timedelta(seconds=2),
        )

        # check result of materialize()
        response_dict = fs.get_online_features(
            [f"{fv.name}:value"], [{"driver_id": 1}]
        ).to_dict()
        assert abs(response_dict[f"{fv.name}:value"][0] - 0.3) < 1e-6

        # check prior value for materialize_incremental()
        response_dict = fs.get_online_features(
            [f"{fv.name}:value"], [{"driver_id": 3}]
        ).to_dict()
        assert abs(response_dict[f"{fv.name}:value"][0] - 4) < 1e-6

        # run materialize_incremental()
        fs.materialize_incremental(
            [fv.name],
            now - timedelta(seconds=0),
        )

        # check result of materialize_incremental()
        response_dict = fs.get_online_features(
            [f"{fv.name}:value"], [{"driver_id": 3}]
        ).to_dict()
        assert abs(response_dict[f"{fv.name}:value"][0] - 5) < 1e-6
Code Example #17
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps, and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 7),
    schema=[
        Field(name="conv_rate", dtype=Float64),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

input_request = RequestSource(
    name="vals_to_add",
    schema=[
        Field(name="val_to_add", dtype=Int64),
        Field(name="val_to_add_2", dtype=Int64),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
)
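
A RequestSource such as `vals_to_add` is typically consumed by an on-demand feature view. Here is a sketch in the style of the Feast quickstart; the transformation body is illustrative and not part of the original snippet:

@on_demand_feature_view(
    sources=[driver_hourly_stats_view, input_request],
    schema=[
        Field(name="conv_rate_plus_val1", dtype=Float64),
        Field(name="conv_rate_plus_val2", dtype=Float64),
    ],
)
def transformed_conv_rate(inputs: pd.DataFrame) -> pd.DataFrame:
    # Combine a stored feature with request-time values.
    df = pd.DataFrame()
    df["conv_rate_plus_val1"] = inputs["conv_rate"] + inputs["val_to_add"]
    df["conv_rate_plus_val2"] = inputs["conv_rate"] + inputs["val_to_add_2"]
    return df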
Code Example #18
def test_historical_features_from_bigquery_sources_containing_backfills(
        capsys):
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)

    entity_dataframe = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": now + timedelta(days=2)
        },
        {
            "driver_id": 1002,
            "event_timestamp": now + timedelta(days=2)
        },
    ])

    driver_stats_df = pd.DataFrame(data=[
        # Duplicated rows simple case
        {
            "driver_id": 1001,
            "avg_daily_trips": 10,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1001,
            "avg_daily_trips": 20,
            "event_timestamp": tomorrow,
            "created": tomorrow,
        },
        # Duplicated rows after a backfill
        {
            "driver_id": 1002,
            "avg_daily_trips": 30,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1002,
            "avg_daily_trips": 40,
            "event_timestamp": tomorrow,
            "created": now,
        },
    ])

    expected_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": now + timedelta(days=2),
            "avg_daily_trips": 20,
        },
        {
            "driver_id": 1002,
            "event_timestamp": now + timedelta(days=2),
            "avg_daily_trips": 40,
        },
    ])

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Entity Dataframe SQL query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(entity_dataframe, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_stats_df,
                                                  driver_table_id)

        store = FeatureStore(config=RepoConfig(
            registry=os.path.join(temp_dir, "registry.db"),
            project="".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)),
            provider="gcp",
            offline_store=BigQueryOfflineStoreConfig(type="bigquery",
                                                     dataset=bigquery_dataset),
        ))

        driver = Entity(name="driver",
                        join_key="driver_id",
                        value_type=ValueType.INT64)
        driver_fv = FeatureView(
            name="driver_stats",
            entities=["driver"],
            features=[Feature(name="avg_daily_trips", dtype=ValueType.INT32)],
            batch_source=BigQuerySource(
                table_ref=driver_table_id,
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created",
            ),
            ttl=None,
        )

        store.apply([driver, driver_fv])

        try:
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=["driver_stats:avg_daily_trips"],
                full_feature_names=False,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns)
            assert_frame_equal(
                expected_df.sort_values(by=["driver_id"]).reset_index(
                    drop=True),
                actual_df_from_sql_entities[expected_df.columns].sort_values(
                    by=["driver_id"]).reset_index(drop=True),
                check_dtype=False,
            )

        finally:
            store.teardown()
Code Example #19
    def test_bigquery_ingestion_correctness(self):
        # create dataset
        ts = pd.Timestamp.now(tz="UTC").round("ms")
        # random value, so the test cannot pass if nothing was actually written to the online store
        checked_value = random.random()
        data = {
            "id": [1, 2, 1],
            "value": [0.1, 0.2, checked_value],
            "ts_1": [ts - timedelta(minutes=2), ts, ts],
            "created_ts": [ts, ts, ts],
        }
        df = pd.DataFrame.from_dict(data)

        # load dataset into BigQuery
        job_config = bigquery.LoadJobConfig()
        table_id = (
            f"{self.gcp_project}.{self.bigquery_dataset}.correctness_{int(time.time())}"
        )
        job = self.client.load_table_from_dataframe(df,
                                                    table_id,
                                                    job_config=job_config)
        job.result()

        # create FeatureView
        fv = FeatureView(
            name="test_bq_correctness",
            entities=["driver_id"],
            features=[Feature("value", ValueType.FLOAT)],
            ttl=timedelta(minutes=5),
            input=BigQuerySource(
                event_timestamp_column="ts",
                table_ref=table_id,
                created_timestamp_column="created_ts",
                field_mapping={
                    "ts_1": "ts",
                    "id": "driver_id"
                },
                date_partition_column="",
            ),
        )
        config = RepoConfig(
            metadata_store="./metadata.db",
            project="default",
            provider="gcp",
            online_store=OnlineStoreConfig(
                local=LocalOnlineStoreConfig("online_store.db")),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv])

        # run materialize()
        fs.materialize(
            ["test_bq_correctness"],
            datetime.utcnow() - timedelta(minutes=5),
            datetime.utcnow() - timedelta(minutes=0),
        )

        # check result of materialize()
        entity_key = EntityKeyProto(entity_names=["driver_id"],
                                    entity_values=[ValueProto(int64_val=1)])
        t, val = fs._get_provider().online_read("default", fv, entity_key)
        assert abs(val["value"].double_val - checked_value) < 1e-6
Code Example #20
    def from_proto(cls,
                   on_demand_feature_view_proto: OnDemandFeatureViewProto):
        """
        Creates an on demand feature view from a protobuf representation.

        Args:
            on_demand_feature_view_proto: A protobuf representation of an on-demand feature view.

        Returns:
            A OnDemandFeatureView object based on the on-demand feature view protobuf.
        """
        sources = []
        for (
                _,
                on_demand_source,
        ) in on_demand_feature_view_proto.spec.sources.items():
            if on_demand_source.WhichOneof("source") == "feature_view":
                sources.append(
                    FeatureView.from_proto(
                        on_demand_source.feature_view).projection)
            elif on_demand_source.WhichOneof(
                    "source") == "feature_view_projection":
                sources.append(
                    FeatureViewProjection.from_proto(
                        on_demand_source.feature_view_projection))
            else:
                sources.append(
                    RequestSource.from_proto(
                        on_demand_source.request_data_source))
        on_demand_feature_view_obj = cls(
            name=on_demand_feature_view_proto.spec.name,
            schema=[
                Field(
                    name=feature.name,
                    dtype=from_value_type(ValueType(feature.value_type)),
                ) for feature in on_demand_feature_view_proto.spec.features
            ],
            sources=sources,
            udf=dill.loads(
                on_demand_feature_view_proto.spec.user_defined_function.body),
            description=on_demand_feature_view_proto.spec.description,
            tags=dict(on_demand_feature_view_proto.spec.tags),
            owner=on_demand_feature_view_proto.spec.owner,
        )

        # FeatureViewProjections are not saved in the OnDemandFeatureView proto.
        # Create the default projection.
        on_demand_feature_view_obj.projection = FeatureViewProjection.from_definition(
            on_demand_feature_view_obj)

        if on_demand_feature_view_proto.meta.HasField("created_timestamp"):
            on_demand_feature_view_obj.created_timestamp = (
                on_demand_feature_view_proto.meta.created_timestamp.ToDatetime()
            )
        if on_demand_feature_view_proto.meta.HasField("last_updated_timestamp"):
            on_demand_feature_view_obj.last_updated_timestamp = (
                on_demand_feature_view_proto.meta.last_updated_timestamp.ToDatetime()
            )

        return on_demand_feature_view_obj
Code Example #21
File: test_registry.py Project: qooba/feast
def test_apply_feature_view_integration(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == ValueType.INT64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == ValueType.STRING
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == ValueType.STRING_LIST
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == ValueType.INT64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == ValueType.STRING
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == ValueType.STRING_LIST
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == ValueType.BYTES_LIST
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Code Example #22
def test_hash():
    file_source = FileSource(name="my-file-source", path="test.parquet")
    feature_view = FeatureView(
        name="my-feature-view",
        entities=[],
        schema=[
            Field(name="feature1", dtype=Float32),
            Field(name="feature2", dtype=Float32),
        ],
        source=file_source,
    )
    sources = [feature_view]
    on_demand_feature_view_1 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf1,
    )
    on_demand_feature_view_2 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf1,
    )
    on_demand_feature_view_3 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf2,
    )
    on_demand_feature_view_4 = OnDemandFeatureView(
        name="my-on-demand-feature-view",
        sources=sources,
        schema=[
            Field(name="output1", dtype=Float32),
            Field(name="output2", dtype=Float32),
        ],
        udf=udf2,
        description="test",
    )

    s1 = {on_demand_feature_view_1, on_demand_feature_view_2}
    assert len(s1) == 1

    s2 = {on_demand_feature_view_1, on_demand_feature_view_3}
    assert len(s2) == 2

    s3 = {on_demand_feature_view_3, on_demand_feature_view_4}
    assert len(s3) == 2

    s4 = {
        on_demand_feature_view_1,
        on_demand_feature_view_2,
        on_demand_feature_view_3,
        on_demand_feature_view_4,
    }
    assert len(s4) == 3
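
The snippet above references `udf1` and `udf2` without defining them; any two distinct pandas transformations would do. Plausible, purely illustrative stand-ins:

import pandas as pd

def udf1(features_df: pd.DataFrame) -> pd.DataFrame:
    # Pass the input features through unchanged.
    df = pd.DataFrame()
    df["output1"] = features_df["feature1"]
    df["output2"] = features_df["feature2"]
    return df

def udf2(features_df: pd.DataFrame) -> pd.DataFrame:
    # A distinct transformation, so udf2 hashes differently from udf1.
    df = pd.DataFrame()
    df["output1"] = features_df["feature1"] + 1.0
    df["output2"] = features_df["feature2"] + 1.0
    return df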