Example #1
def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
) -> List[FeatureViewQueryContext]:
    """Build a query context containing all information required to template a BigQuery and Redshift point-in-time SQL query"""

    (
        feature_views_to_feature_map,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views, registry.list_on_demand_feature_views(project)
    )

    query_context = []
    for feature_view, features in feature_views_to_feature_map.items():
        join_keys = []
        entity_selections = []
        reverse_field_mapping = {
            v: k for k, v in feature_view.input.field_mapping.items()
        }
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key
            )
            join_keys.append(join_key)
            entity_selections.append(f"{entity.join_key} AS {join_key}")

        if isinstance(feature_view.ttl, timedelta):
            ttl_seconds = int(feature_view.ttl.total_seconds())
        else:
            ttl_seconds = 0

        event_timestamp_column = feature_view.input.event_timestamp_column
        created_timestamp_column = feature_view.input.created_timestamp_column

        context = FeatureViewQueryContext(
            name=feature_view.projection.name_to_use(),
            ttl=ttl_seconds,
            entities=join_keys,
            features=features,
            event_timestamp_column=reverse_field_mapping.get(
                event_timestamp_column, event_timestamp_column
            ),
            created_timestamp_column=reverse_field_mapping.get(
                created_timestamp_column, created_timestamp_column
            ),
            # TODO: Make created column optional and not hardcoded
            table_subquery=feature_view.input.get_table_query_string(),
            entity_selections=entity_selections,
        )
        query_context.append(context)
    return query_context
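
The list returned above feeds a SQL templating step (the docstring mentions BigQuery and Redshift). As a rough, hypothetical illustration of how the fields set in the constructor could be rendered into per-view subqueries, assuming the context object exposes attributes named like its constructor arguments (this is not the actual template shipped with those offline stores):

def render_subqueries(query_context):
    # Illustrative only: join the aliased entity columns and feature columns,
    # then select them from the feature view's table subquery.
    subqueries = []
    for context in query_context:
        selected_columns = ", ".join(context.entity_selections + context.features)
        subqueries.append(
            f"SELECT {selected_columns}, {context.event_timestamp_column} "
            f"FROM {context.table_subquery}  -- feature view: {context.name}, ttl: {context.ttl}s"
        )
    return subqueries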
Example #2
def _tag_registry_on_demand_feature_views_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[OnDemandFeatureView], List[OnDemandFeatureView]]:
    odfvs_to_keep: List[OnDemandFeatureView] = repo.on_demand_feature_views
    odfvs_to_delete: List[OnDemandFeatureView] = []
    repo_on_demand_feature_view_names = set(
        t.name for t in repo.on_demand_feature_views
    )
    for registry_odfv in registry.list_on_demand_feature_views(project=project):
        if registry_odfv.name not in repo_on_demand_feature_view_names:
            odfvs_to_delete.append(registry_odfv)
    return odfvs_to_keep, odfvs_to_delete
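
The keep/delete split above is a name-based diff between what the repo declares and what the registry currently holds. A self-contained sketch of the same logic on plain strings (hypothetical names, not the Feast API):

repo_names = {"transformed_conv_rate", "driver_age"}       # declared in the repo
registry_names = {"transformed_conv_rate", "stale_view"}   # currently registered

odfv_names_to_keep = repo_names                             # everything in the repo is kept
odfv_names_to_delete = registry_names - repo_names          # {"stale_view"} would be deleted
print(sorted(odfv_names_to_keep), sorted(odfv_names_to_delete))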
Example #3
    def get_historical_features(
        config: RepoConfig,
        feature_views: List[FeatureView],
        feature_refs: List[str],
        entity_df: Union[pd.DataFrame, str],
        registry: Registry,
        project: str,
        full_feature_names: bool = False,
    ) -> RetrievalJob:
        if not isinstance(entity_df, pd.DataFrame):
            raise ValueError(
                f"Please provide an entity_df of type {type(pd.DataFrame)} instead of type {type(entity_df)}"
            )
        entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL  # local modifiable copy of global variable
        if entity_df_event_timestamp_col not in entity_df.columns:
            datetime_columns = entity_df.select_dtypes(
                include=["datetime", "datetimetz"]).columns
            if len(datetime_columns) == 1:
                print(
                    f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
                )
                entity_df_event_timestamp_col = datetime_columns[0]
            else:
                raise ValueError(
                    f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
                )
        (
            feature_views_to_features,
            on_demand_feature_views_to_features,
        ) = _get_requested_feature_views_to_features_dict(
            feature_refs,
            feature_views,
            registry.list_on_demand_feature_views(config.project),
        )

        # Create a lazy function that is only called from the RetrievalJob object
        def evaluate_historical_retrieval():

            # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
            entity_df[entity_df_event_timestamp_col] = entity_df[
                entity_df_event_timestamp_col].apply(
                    lambda x: x
                    if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

            # Create a copy of entity_df to prevent modifying the original
            entity_df_with_features = entity_df.copy()

            # Convert event timestamp column to datetime and normalize time zone to UTC
            # This is necessary to avoid issues with pd.merge_asof
            entity_df_with_features[
                entity_df_event_timestamp_col] = pd.to_datetime(
                    entity_df_with_features[entity_df_event_timestamp_col],
                    utc=True)

            # Sort event timestamp values
            entity_df_with_features = entity_df_with_features.sort_values(
                entity_df_event_timestamp_col)

            # Load feature view data from sources and join them incrementally
            for feature_view, features in feature_views_to_features.items():
                event_timestamp_column = (
                    feature_view.batch_source.event_timestamp_column)
                created_timestamp_column = (
                    feature_view.batch_source.created_timestamp_column)

                # Read offline parquet data in pyarrow format.
                filesystem, path = FileSource.create_filesystem_and_path(
                    feature_view.batch_source.path,
                    feature_view.batch_source.file_options.
                    s3_endpoint_override,
                )
                table = pyarrow.parquet.read_table(path, filesystem=filesystem)

                # Rename columns by the field mapping dictionary if it exists
                if feature_view.batch_source.field_mapping is not None:
                    table = _run_field_mapping(
                        table, feature_view.batch_source.field_mapping)
                # Rename entity columns by the join_key_map dictionary if it exists
                if feature_view.projection.join_key_map:
                    table = _run_field_mapping(
                        table, feature_view.projection.join_key_map)

                # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
                # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean
                # If the dtype is 'object', then missing values are inferred as python `None`s.
                # More details at:
                # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
                df_to_join = table.to_pandas()

                # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
                df_to_join[event_timestamp_column] = df_to_join[
                    event_timestamp_column].apply(lambda x: x
                                                  if x.tzinfo is not None else
                                                  x.replace(tzinfo=pytz.utc))
                if created_timestamp_column:
                    df_to_join[created_timestamp_column] = df_to_join[
                        created_timestamp_column].apply(
                            lambda x: x if x.tzinfo is not None else x.replace(
                                tzinfo=pytz.utc))

                # Sort dataframe by the event timestamp column
                df_to_join = df_to_join.sort_values(event_timestamp_column)

                # Build a list of all the features we should select from this source
                feature_names = []
                for feature in features:
                    # Modify the separator for feature refs in column names to double underscore. We are using
                    # double underscore as separator for consistency with other databases like BigQuery,
                    # where there are very few characters available for use as separators
                    if full_feature_names:
                        formatted_feature_name = (
                            f"{feature_view.projection.name_to_use()}__{feature}"
                        )
                    else:
                        formatted_feature_name = feature
                    # Add the feature name to the list of columns
                    feature_names.append(formatted_feature_name)

                    # Ensure that the source dataframe feature column includes the feature view name as a prefix
                    df_to_join.rename(
                        columns={feature: formatted_feature_name},
                        inplace=True,
                    )

                # Build a list of entity columns to join on (from the right table)
                join_keys = []
                for entity_name in feature_view.entities:
                    entity = registry.get_entity(entity_name, project)
                    join_key = feature_view.projection.join_key_map.get(
                        entity.join_key, entity.join_key)
                    join_keys.append(join_key)
                right_entity_columns = join_keys
                right_entity_key_columns = [event_timestamp_column
                                            ] + right_entity_columns

                # Remove all duplicate entity keys (using created timestamp)
                right_entity_key_sort_columns = right_entity_key_columns
                if created_timestamp_column:
                    # If created_timestamp is available, use it to dedupe deterministically
                    right_entity_key_sort_columns = right_entity_key_sort_columns + [
                        created_timestamp_column
                    ]

                df_to_join.sort_values(by=right_entity_key_sort_columns,
                                       inplace=True)
                df_to_join.drop_duplicates(
                    right_entity_key_sort_columns,
                    keep="last",
                    ignore_index=True,
                    inplace=True,
                )

                # Select only the columns we need to join from the feature dataframe
                df_to_join = df_to_join[right_entity_key_columns +
                                        feature_names]

                # Do a point-in-time join between entity_df and the feature dataframe
                entity_df_with_features = pd.merge_asof(
                    entity_df_with_features,
                    df_to_join,
                    left_on=entity_df_event_timestamp_col,
                    right_on=event_timestamp_column,
                    by=right_entity_columns or None,
                    tolerance=feature_view.ttl,
                )

                # Remove right (feature table/view) event_timestamp column.
                if event_timestamp_column != entity_df_event_timestamp_col:
                    entity_df_with_features.drop(
                        columns=[event_timestamp_column], inplace=True)

                # Ensure that we delete dataframes to free up memory
                del df_to_join

            # Move "event_timestamp" column to front
            current_cols = entity_df_with_features.columns.tolist()
            current_cols.remove(entity_df_event_timestamp_col)
            entity_df_with_features = entity_df_with_features[
                [entity_df_event_timestamp_col] + current_cols]

            return entity_df_with_features

        job = FileRetrievalJob(
            evaluation_function=evaluate_historical_retrieval,
            full_feature_names=full_feature_names,
            on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
                feature_refs, project, registry),
        )
        return job
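
The heart of this in-memory implementation is the pd.merge_asof call: for each entity row it picks the newest feature row that is not later than the entity's event timestamp and not older than the feature view's TTL. A tiny self-contained pandas sketch of the same technique (column names are made up for the illustration):

import pandas as pd

entity_df = pd.DataFrame({
    "driver_id": [1001, 1001],
    "event_timestamp": pd.to_datetime(["2021-04-12 10:00", "2021-04-12 12:00"], utc=True),
}).sort_values("event_timestamp")

features_df = pd.DataFrame({
    "driver_id": [1001, 1001],
    "event_timestamp": pd.to_datetime(["2021-04-12 09:30", "2021-04-12 11:30"], utc=True),
    "conv_rate": [0.1, 0.2],
}).sort_values("event_timestamp")

joined = pd.merge_asof(
    entity_df,
    features_df,
    on="event_timestamp",             # both frames must be sorted by this column
    by="driver_id",                   # join key, analogous to right_entity_columns
    tolerance=pd.Timedelta(hours=1),  # analogous to feature_view.ttl
)
print(joined)  # each entity row gets the latest feature row within the 1h window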
Example #4
class FeatureStore:
    """
    A FeatureStore object is used to define, create, and retrieve features.

    Args:
        repo_path (optional): Path to a `feature_store.yaml` used to configure the
            feature store.
        config (optional): Configuration object used to configure the feature store.
    """

    config: RepoConfig
    repo_path: Path
    _registry: Registry

    @log_exceptions
    def __init__(
        self,
        repo_path: Optional[str] = None,
        config: Optional[RepoConfig] = None,
    ):
        """
        Creates a FeatureStore object.

        Raises:
            ValueError: If both or neither of repo_path and config are specified.
        """
        if repo_path is not None and config is not None:
            raise ValueError("You cannot specify both repo_path and config.")
        if config is not None:
            self.repo_path = Path(os.getcwd())
            self.config = config
        elif repo_path is not None:
            self.repo_path = Path(repo_path)
            self.config = load_repo_config(Path(repo_path))
        else:
            raise ValueError("Please specify one of repo_path or config.")

        registry_config = self.config.get_registry_config()
        self._registry = Registry(registry_config, repo_path=self.repo_path)

    @log_exceptions
    def version(self) -> str:
        """Returns the version of the current Feast SDK/CLI."""
        return get_version()

    @property
    def registry(self) -> Registry:
        """Gets the registry of this feature store."""
        return self._registry

    @property
    def project(self) -> str:
        """Gets the project of this feature store."""
        return self.config.project

    def _get_provider(self) -> Provider:
        # TODO: Bake self.repo_path into self.config so that we only have one interface to paths
        return get_provider(self.config, self.repo_path)

    @log_exceptions_and_usage
    def refresh_registry(self):
        """Fetches and caches a copy of the feature registry in memory.

        Explicitly calling this method allows for direct control of the state of the registry cache. Every time this
        method is called the complete registry state will be retrieved from the remote registry store backend
        (e.g., GCS, S3), and the cache timer will be reset. If refresh_registry() is run before get_online_features()
        is called, then get_online_features() will use the cached registry instead of retrieving (and caching) the
        registry itself.

        Additionally, the TTL for the registry cache can be set to infinity (by setting it to 0), which means that
        refresh_registry() will become the only way to update the cached registry. If the TTL is set to a value
        greater than 0, then once the cache becomes stale (more time than the TTL has passed), a new cache will be
        downloaded synchronously, which may increase latencies if the triggering method is get_online_features().
        """
        registry_config = self.config.get_registry_config()
        self._registry = Registry(registry_config, repo_path=self.repo_path)
        self._registry.refresh()

    @log_exceptions_and_usage
    def list_entities(self, allow_cache: bool = False) -> List[Entity]:
        """
        Retrieves the list of entities from the registry.

        Args:
            allow_cache: Whether to allow returning entities from a cached registry.

        Returns:
            A list of entities.
        """
        return self._list_entities(allow_cache)

    def _list_entities(self,
                       allow_cache: bool = False,
                       hide_dummy_entity: bool = True) -> List[Entity]:
        all_entities = self._registry.list_entities(self.project,
                                                    allow_cache=allow_cache)
        return [
            entity for entity in all_entities
            if entity.name != DUMMY_ENTITY_NAME or not hide_dummy_entity
        ]

    @log_exceptions_and_usage
    def list_feature_services(self) -> List[FeatureService]:
        """
        Retrieves the list of feature services from the registry.

        Returns:
            A list of feature services.
        """
        return self._registry.list_feature_services(self.project)

    @log_exceptions_and_usage
    def list_feature_views(self,
                           allow_cache: bool = False) -> List[FeatureView]:
        """
        Retrieves the list of feature views from the registry.

        Args:
            allow_cache: Whether to allow returning feature views from a cached registry.

        Returns:
            A list of feature views.
        """
        return self._list_feature_views(allow_cache)

    def _list_feature_views(
            self,
            allow_cache: bool = False,
            hide_dummy_entity: bool = True) -> List[FeatureView]:
        feature_views = []
        for fv in self._registry.list_feature_views(self.project,
                                                    allow_cache=allow_cache):
            if hide_dummy_entity and fv.entities[0] == DUMMY_ENTITY_NAME:
                fv.entities = []
            feature_views.append(fv)
        return feature_views

    @log_exceptions_and_usage
    def list_on_demand_feature_views(self) -> List[OnDemandFeatureView]:
        """
        Retrieves the list of on demand feature views from the registry.

        Returns:
            A list of on demand feature views.
        """
        return self._registry.list_on_demand_feature_views(self.project)

    @log_exceptions_and_usage
    def get_entity(self, name: str) -> Entity:
        """
        Retrieves an entity.

        Args:
            name: Name of entity.

        Returns:
            The specified entity.

        Raises:
            EntityNotFoundException: The entity could not be found.
        """
        return self._registry.get_entity(name, self.project)

    @log_exceptions_and_usage
    def get_feature_service(self, name: str) -> FeatureService:
        """
        Retrieves a feature service.

        Args:
            name: Name of feature service.

        Returns:
            The specified feature service.

        Raises:
            FeatureServiceNotFoundException: The feature service could not be found.
        """
        return self._registry.get_feature_service(name, self.project)

    @log_exceptions_and_usage
    def get_feature_view(self, name: str) -> FeatureView:
        """
        Retrieves a feature view.

        Args:
            name: Name of feature view.

        Returns:
            The specified feature view.

        Raises:
            FeatureViewNotFoundException: The feature view could not be found.
        """
        return self._get_feature_view(name)

    def _get_feature_view(self,
                          name: str,
                          hide_dummy_entity: bool = True) -> FeatureView:
        feature_view = self._registry.get_feature_view(name, self.project)
        if hide_dummy_entity and feature_view.entities[0] == DUMMY_ENTITY_NAME:
            feature_view.entities = []
        return feature_view

    @log_exceptions_and_usage
    def get_on_demand_feature_view(self, name: str) -> OnDemandFeatureView:
        """
        Retrieves an on demand feature view.

        Args:
            name: Name of on demand feature view.

        Returns:
            The specified on demand feature view.

        Raises:
            FeatureViewNotFoundException: The on demand feature view could not be found.
        """
        return self._registry.get_on_demand_feature_view(name, self.project)

    @log_exceptions_and_usage
    def delete_feature_view(self, name: str):
        """
        Deletes a feature view.

        Args:
            name: Name of feature view.

        Raises:
            FeatureViewNotFoundException: The feature view could not be found.
        """
        return self._registry.delete_feature_view(name, self.project)

    @log_exceptions_and_usage
    def delete_feature_service(self, name: str):
        """
        Deletes a feature service.

        Args:
            name: Name of feature service.

        Raises:
            FeatureServiceNotFoundException: The feature service could not be found.
        """
        return self._registry.delete_feature_service(name, self.project)

    def _get_features(
        self,
        features: Optional[Union[List[str], FeatureService]],
        feature_refs: Optional[List[str]],
    ) -> List[str]:
        _features = features or feature_refs
        if not _features:
            raise ValueError("No features specified for retrieval")

        _feature_refs: List[str]
        if isinstance(_features, FeatureService):
            # Get the latest value of the feature service, in case the object passed in has been updated underneath us.
            _feature_refs = _get_feature_refs_from_feature_services(
                self.get_feature_service(_features.name))
        else:
            _feature_refs = _features
        return _feature_refs

    @log_exceptions_and_usage
    def apply(
        self,
        objects: Union[Entity, FeatureView, OnDemandFeatureView,
                       FeatureService, List[Union[FeatureView,
                                                  OnDemandFeatureView, Entity,
                                                  FeatureService]], ],
        commit: bool = True,
    ):
        """Register objects to metadata store and update related infrastructure.

        The apply method registers one or more definitions (e.g., Entity, FeatureView) and registers or updates these
        objects in the Feast registry. Once the registry has been updated, the apply method will update related
        infrastructure (e.g., create tables in an online store) in order to reflect these new definitions. All
        operations are idempotent, meaning they can safely be rerun.

        Args:
            objects: A single object, or a list of objects that should be registered with the Feature Store.
            commit: Whether to commit changes to the registry.

        Raises:
            ValueError: The 'objects' parameter could not be parsed properly.

        Examples:
            Register an Entity and a FeatureView.

            >>> from feast import FeatureStore, Entity, FeatureView, Feature, ValueType, FileSource, RepoConfig
            >>> from datetime import timedelta
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")
            >>> driver_hourly_stats = FileSource(
            ...     path="feature_repo/data/driver_stats.parquet",
            ...     event_timestamp_column="event_timestamp",
            ...     created_timestamp_column="created",
            ... )
            >>> driver_hourly_stats_view = FeatureView(
            ...     name="driver_hourly_stats",
            ...     entities=["driver_id"],
            ...     ttl=timedelta(seconds=86400 * 1),
            ...     batch_source=driver_hourly_stats,
            ... )
            >>> fs.apply([driver_hourly_stats_view, driver]) # register entity and feature view
        """
        # TODO: Add locking

        if not isinstance(objects, Iterable):
            objects = [objects]

        assert isinstance(objects, list)

        views_to_update = [ob for ob in objects if isinstance(ob, FeatureView)]
        odfvs_to_update = [
            ob for ob in objects if isinstance(ob, OnDemandFeatureView)
        ]
        if (not flags_helper.enable_on_demand_feature_views(self.config)
                and len(odfvs_to_update) > 0):
            raise ExperimentalFeatureNotEnabled(
                flags.FLAG_ON_DEMAND_TRANSFORM_NAME)

        if len(odfvs_to_update) > 0:
            log_event(UsageEvent.APPLY_WITH_ODFV)

        _validate_feature_views(views_to_update)
        entities_to_update = [ob for ob in objects if isinstance(ob, Entity)]
        services_to_update = [
            ob for ob in objects if isinstance(ob, FeatureService)
        ]

        # Make inferences
        update_entities_with_inferred_types_from_feature_views(
            entities_to_update, views_to_update, self.config)

        update_data_sources_with_inferred_event_timestamp_col(
            [view.batch_source for view in views_to_update], self.config)

        for view in views_to_update:
            view.infer_features_from_batch_source(self.config)

        for odfv in odfvs_to_update:
            odfv.infer_features()

        if len(views_to_update) + len(entities_to_update) + len(
                services_to_update) + len(odfvs_to_update) != len(objects):
            raise ValueError(
                "Unknown object type provided as part of apply() call")

        # DUMMY_ENTITY is a placeholder entity used in entityless FeatureViews
        DUMMY_ENTITY = Entity(
            name=DUMMY_ENTITY_NAME,
            join_key=DUMMY_ENTITY_ID,
            value_type=ValueType.INT32,
        )
        entities_to_update.append(DUMMY_ENTITY)

        for view in views_to_update:
            self._registry.apply_feature_view(view,
                                              project=self.project,
                                              commit=False)
        for odfv in odfvs_to_update:
            self._registry.apply_on_demand_feature_view(odfv,
                                                        project=self.project,
                                                        commit=False)
        for ent in entities_to_update:
            self._registry.apply_entity(ent,
                                        project=self.project,
                                        commit=False)
        for feature_service in services_to_update:
            self._registry.apply_feature_service(feature_service,
                                                 project=self.project)

        self._get_provider().update_infra(
            project=self.project,
            tables_to_delete=[],
            tables_to_keep=views_to_update,
            entities_to_delete=[],
            entities_to_keep=entities_to_update,
            partial=True,
        )

        if commit:
            self._registry.commit()

    @log_exceptions_and_usage
    def teardown(self):
        """Tears down all local and cloud resources for the feature store."""
        tables: List[Union[FeatureView, FeatureTable]] = []
        feature_views = self.list_feature_views()
        feature_tables = self._registry.list_feature_tables(self.project)

        tables.extend(feature_views)
        tables.extend(feature_tables)

        entities = self.list_entities()

        self._get_provider().teardown_infra(self.project, tables, entities)
        self._registry.teardown()

    @log_exceptions_and_usage
    def get_historical_features(
        self,
        entity_df: Union[pd.DataFrame, str],
        features: Optional[Union[List[str], FeatureService]] = None,
        feature_refs: Optional[List[str]] = None,
        full_feature_names: bool = False,
    ) -> RetrievalJob:
        """Enrich an entity dataframe with historical feature values for either training or batch scoring.

        This method joins historical feature data from one or more feature views to an entity dataframe by using a time
        travel join.

        Each feature view is joined to the entity dataframe using all entities configured for the respective feature
        view. All configured entities must be available in the entity dataframe. Therefore, the entity dataframe must
        contain all entities found in all feature views, but the individual feature views can have different entities.

        Time travel is based on the configured TTL for each feature view. A shorter TTL will limit the
        amount of scanning that will be done in order to find feature data for a specific entity key. Setting a short
        TTL may result in null values being returned.

        Args:
            entity_df (Union[pd.DataFrame, str]): An entity dataframe is a collection of rows containing all entity
                columns (e.g., customer_id, driver_id) on which features need to be joined, as well as an event_timestamp
                column used to ensure point-in-time correctness. Either a Pandas DataFrame can be provided or a string
                SQL query. The query must be of a format supported by the configured offline store (e.g., BigQuery).
            features: A list of features that should be retrieved from the offline store.
                Either a list of string feature references can be provided or a FeatureService object.
                Feature references are of the format "feature_view:feature", e.g., "customer_fv:daily_transactions".
            full_feature_names: A boolean that provides the option to add the feature view prefixes to the feature names,
                changing them from the format "feature" to "feature_view__feature" (e.g., "daily_transactions" changes to
                "customer_fv__daily_transactions"). By default, this value is set to False.

        Returns:
            RetrievalJob which can be used to materialize the results.

        Raises:
            ValueError: Both or neither of features and feature_refs are specified.

        Examples:
            Retrieve historical features from a local offline store.

            >>> from feast import FeatureStore, RepoConfig
            >>> import pandas as pd
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> entity_df = pd.DataFrame.from_dict(
            ...     {
            ...         "driver_id": [1001, 1002],
            ...         "event_timestamp": [
            ...             datetime(2021, 4, 12, 10, 59, 42),
            ...             datetime(2021, 4, 12, 8, 12, 10),
            ...         ],
            ...     }
            ... )
            >>> retrieval_job = fs.get_historical_features(
            ...     entity_df=entity_df,
            ...     features=[
            ...         "driver_hourly_stats:conv_rate",
            ...         "driver_hourly_stats:acc_rate",
            ...         "driver_hourly_stats:avg_daily_trips",
            ...     ],
            ... )
            >>> feature_data = retrieval_job.to_df()
        """
        if (features is not None
                and feature_refs is not None) or (features is None
                                                  and feature_refs is None):
            raise ValueError(
                "You must specify exactly one of features and feature_refs.")

        if feature_refs:
            warnings.warn(
                ("The argument 'feature_refs' is being deprecated. Please use 'features' "
                 "instead. Feast 0.13 and onwards will not support the argument 'feature_refs'."
                 ),
                DeprecationWarning,
            )

        _feature_refs = self._get_features(features, feature_refs)

        all_feature_views = self.list_feature_views()
        all_on_demand_feature_views = self._registry.list_on_demand_feature_views(
            project=self.project)

        # TODO(achal): _group_feature_refs returns the on demand feature views, but they're not passed into the provider.
        # This is a weird interface quirk - we should revisit `get_historical_features` to
        # pass in the on demand feature views as well.
        fvs, odfvs = _group_feature_refs(_feature_refs, all_feature_views,
                                         all_on_demand_feature_views)
        feature_views = list(view for view, _ in fvs)
        on_demand_feature_views = list(view for view, _ in odfvs)
        if len(on_demand_feature_views) > 0:
            log_event(UsageEvent.GET_HISTORICAL_FEATURES_WITH_ODFV)

        # Check that the right request data is present in the entity_df
        if type(entity_df) == pd.DataFrame:
            entity_pd_df = cast(pd.DataFrame, entity_df)
            for odfv in on_demand_feature_views:
                odfv_inputs = odfv.inputs.values()
                for odfv_input in odfv_inputs:
                    if type(odfv_input) == RequestDataSource:
                        request_data_source = cast(RequestDataSource,
                                                   odfv_input)
                        for feature_name in request_data_source.schema.keys():
                            if feature_name not in entity_pd_df.columns:
                                raise RequestDataNotFoundInEntityDfException(
                                    feature_name=feature_name,
                                    feature_view_name=odfv.name,
                                )

        _validate_feature_refs(_feature_refs, full_feature_names)

        provider = self._get_provider()

        job = provider.get_historical_features(
            self.config,
            feature_views,
            _feature_refs,
            entity_df,
            self._registry,
            self.project,
            full_feature_names,
        )

        return job

    @log_exceptions_and_usage
    def materialize_incremental(
        self,
        end_date: datetime,
        feature_views: Optional[List[str]] = None,
    ) -> None:
        """
        Materialize incremental new data from the offline store into the online store.

        This method loads incremental new feature data up to the specified end time from either
        the specified feature views, or all feature views if none are specified,
        into the online store where it is available for online serving. The start time of
        the interval materialized is either the most recent end time of a prior materialization or
        (now - ttl) if no such prior materialization exists.

        Args:
            end_date (datetime): End date for time range of data to materialize into the online store
            feature_views (List[str]): Optional list of feature view names. If selected, will only run
                materialization for the specified feature views.

        Raises:
            Exception: A feature view being materialized does not have a TTL set.

        Examples:
            Materialize all features into the online store up to 5 minutes ago.

            >>> from feast import FeatureStore, RepoConfig
            >>> from datetime import datetime, timedelta
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> fs.materialize_incremental(end_date=datetime.utcnow() - timedelta(minutes=5))
            Materializing...
            <BLANKLINE>
            ...
        """
        feature_views_to_materialize = []
        if feature_views is None:
            feature_views_to_materialize = self._list_feature_views(
                hide_dummy_entity=False)
        else:
            for name in feature_views:
                feature_view = self._get_feature_view(name,
                                                      hide_dummy_entity=False)
                feature_views_to_materialize.append(feature_view)

        _print_materialization_log(
            None,
            end_date,
            len(feature_views_to_materialize),
            self.config.online_store.type,
        )
        # TODO paging large loads
        for feature_view in feature_views_to_materialize:
            start_date = feature_view.most_recent_end_time
            if start_date is None:
                if feature_view.ttl is None:
                    raise Exception(
                        f"No start time found for feature view {feature_view.name}. materialize_incremental() requires"
                        f" either a ttl to be set or for materialize() to have been run at least once."
                    )
                start_date = datetime.utcnow() - feature_view.ttl
            provider = self._get_provider()
            print(
                f"{Style.BRIGHT + Fore.GREEN}{feature_view.name}{Style.RESET_ALL}"
                f" from {Style.BRIGHT + Fore.GREEN}{start_date.replace(microsecond=0).astimezone()}{Style.RESET_ALL}"
                f" to {Style.BRIGHT + Fore.GREEN}{end_date.replace(microsecond=0).astimezone()}{Style.RESET_ALL}:"
            )

            def tqdm_builder(length):
                return tqdm(total=length, ncols=100)

            start_date = utils.make_tzaware(start_date)
            end_date = utils.make_tzaware(end_date)

            provider.materialize_single_feature_view(
                config=self.config,
                feature_view=feature_view,
                start_date=start_date,
                end_date=end_date,
                registry=self._registry,
                project=self.project,
                tqdm_builder=tqdm_builder,
            )

            self._registry.apply_materialization(feature_view, self.project,
                                                 start_date, end_date)

    @log_exceptions_and_usage
    def materialize(
        self,
        start_date: datetime,
        end_date: datetime,
        feature_views: Optional[List[str]] = None,
    ) -> None:
        """
        Materialize data from the offline store into the online store.

        This method loads feature data in the specified interval from either
        the specified feature views, or all feature views if none are specified,
        into the online store where it is available for online serving.

        Args:
            start_date (datetime): Start date for time range of data to materialize into the online store
            end_date (datetime): End date for time range of data to materialize into the online store
            feature_views (List[str]): Optional list of feature view names. If selected, will only run
                materialization for the specified feature views.

        Examples:
            Materialize all features into the online store over the interval
            from 3 hours ago to 10 minutes ago.

            >>> from feast import FeatureStore, RepoConfig
            >>> from datetime import datetime, timedelta
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> fs.materialize(
            ...     start_date=datetime.utcnow() - timedelta(hours=3), end_date=datetime.utcnow() - timedelta(minutes=10)
            ... )
            Materializing...
            <BLANKLINE>
            ...
        """
        if utils.make_tzaware(start_date) > utils.make_tzaware(end_date):
            raise ValueError(
                f"The given start_date {start_date} is greater than the given end_date {end_date}."
            )

        feature_views_to_materialize = []
        if feature_views is None:
            feature_views_to_materialize = self._list_feature_views(
                hide_dummy_entity=False)
        else:
            for name in feature_views:
                feature_view = self._get_feature_view(name,
                                                      hide_dummy_entity=False)
                feature_views_to_materialize.append(feature_view)

        _print_materialization_log(
            start_date,
            end_date,
            len(feature_views_to_materialize),
            self.config.online_store.type,
        )
        # TODO paging large loads
        for feature_view in feature_views_to_materialize:
            provider = self._get_provider()
            print(
                f"{Style.BRIGHT + Fore.GREEN}{feature_view.name}{Style.RESET_ALL}:"
            )

            def tqdm_builder(length):
                return tqdm(total=length, ncols=100)

            start_date = utils.make_tzaware(start_date)
            end_date = utils.make_tzaware(end_date)

            provider.materialize_single_feature_view(
                config=self.config,
                feature_view=feature_view,
                start_date=start_date,
                end_date=end_date,
                registry=self._registry,
                project=self.project,
                tqdm_builder=tqdm_builder,
            )

            self._registry.apply_materialization(feature_view, self.project,
                                                 start_date, end_date)

    @log_exceptions_and_usage
    def get_online_features(
        self,
        features: Union[List[str], FeatureService],
        entity_rows: List[Dict[str, Any]],
        feature_refs: Optional[List[str]] = None,
        full_feature_names: bool = False,
    ) -> OnlineResponse:
        """
        Retrieves the latest online feature data.

        Note: This method will download the full feature registry the first time it is run. If you are using a
        remote registry like GCS or S3 then that may take a few seconds. The registry remains cached up to a TTL
        duration (which can be set to infinity). If the cached registry is stale (more time than the TTL has
        passed), then a new registry will be downloaded synchronously by this method. This download may
        introduce latency to online feature retrieval. In order to avoid synchronous downloads, please call
        refresh_registry() prior to the TTL being reached. Remember it is possible to set the cache TTL to
        infinity (cache forever).

        Args:
            features: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
                Only the feature name is required.
            entity_rows: A list of dictionaries where each key-value is an entity-name, entity-value pair.

        Returns:
            OnlineResponse containing the feature data in records.

        Raises:
            Exception: No entity with the specified name exists.

        Examples:
            Materialize all features into the online store over the interval
            from 3 hours ago to 10 minutes ago, and then retrieve these online features.

            >>> from feast import FeatureStore, RepoConfig
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> online_response = fs.get_online_features(
            ...     features=[
            ...         "driver_hourly_stats:conv_rate",
            ...         "driver_hourly_stats:acc_rate",
            ...         "driver_hourly_stats:avg_daily_trips",
            ...     ],
            ...     entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}, {"driver_id": 1003}, {"driver_id": 1004}],
            ... )
            >>> online_response_dict = online_response.to_dict()
        """
        _feature_refs = self._get_features(features, feature_refs)
        all_feature_views = self._list_feature_views(allow_cache=True,
                                                     hide_dummy_entity=False)
        all_on_demand_feature_views = self._registry.list_on_demand_feature_views(
            project=self.project, allow_cache=True)

        _validate_feature_refs(_feature_refs, full_feature_names)
        grouped_refs, grouped_odfv_refs = _group_feature_refs(
            _feature_refs, all_feature_views, all_on_demand_feature_views)
        if len(grouped_odfv_refs) > 0:
            log_event(UsageEvent.GET_ONLINE_FEATURES_WITH_ODFV)

        feature_views = list(view for view, _ in grouped_refs)
        entityless_case = DUMMY_ENTITY_NAME in [
            entity_name for feature_view in feature_views
            for entity_name in feature_view.entities
        ]

        provider = self._get_provider()
        entities = self._list_entities(allow_cache=True,
                                       hide_dummy_entity=False)
        entity_name_to_join_key_map = {}
        for entity in entities:
            entity_name_to_join_key_map[entity.name] = entity.join_key

        needed_request_data_features = self._get_needed_request_data_features(
            grouped_odfv_refs)

        join_key_rows = []
        request_data_features: Dict[str, List[Any]] = {}
        # Entity rows may be either entities or request data.
        for row in entity_rows:
            join_key_row = {}
            for entity_name, entity_value in row.items():
                # Found request data
                if entity_name in needed_request_data_features:
                    if entity_name not in request_data_features:
                        request_data_features[entity_name] = []
                    request_data_features[entity_name].append(entity_value)
                    continue
                try:
                    join_key = entity_name_to_join_key_map[entity_name]
                except KeyError:
                    raise EntityNotFoundException(entity_name, self.project)
                join_key_row[join_key] = entity_value
                if entityless_case:
                    join_key_row[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
            if len(join_key_row) > 0:
                # May be empty if this entity row was request data
                join_key_rows.append(join_key_row)

        if len(needed_request_data_features) != len(
                request_data_features.keys()):
            raise RequestDataNotFoundInEntityRowsException(
                feature_names=needed_request_data_features)

        entity_row_proto_list = _infer_online_entity_rows(join_key_rows)

        union_of_entity_keys: List[EntityKeyProto] = []
        result_rows: List[GetOnlineFeaturesResponse.FieldValues] = []

        for entity_row_proto in entity_row_proto_list:
            # Create a list of entity keys to filter down for each feature view at lookup time.
            union_of_entity_keys.append(_entity_row_to_key(entity_row_proto))
            # Also create entity values to append to the result
            result_rows.append(_entity_row_to_field_values(entity_row_proto))

        # Add more feature values to the existing result rows for the request data features
        for feature_name, feature_values in request_data_features.items():
            for row_idx, feature_value in enumerate(feature_values):
                result_row = result_rows[row_idx]
                result_row.fields[feature_name].CopyFrom(
                    python_value_to_proto_value(feature_value))
                result_row.statuses[
                    feature_name] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

        for table, requested_features in grouped_refs:
            self._populate_result_rows_from_feature_view(
                entity_name_to_join_key_map,
                full_feature_names,
                provider,
                requested_features,
                result_rows,
                table,
                union_of_entity_keys,
            )

        initial_response = OnlineResponse(
            GetOnlineFeaturesResponse(field_values=result_rows))
        return self._augment_response_with_on_demand_transforms(
            _feature_refs, full_feature_names, initial_response, result_rows)

    def _populate_result_rows_from_feature_view(
        self,
        entity_name_to_join_key_map: Dict[str, str],
        full_feature_names: bool,
        provider: Provider,
        requested_features: List[str],
        result_rows: List[GetOnlineFeaturesResponse.FieldValues],
        table: FeatureView,
        union_of_entity_keys: List[EntityKeyProto],
    ):
        entity_keys = _get_table_entity_keys(table, union_of_entity_keys,
                                             entity_name_to_join_key_map)
        read_rows = provider.online_read(
            config=self.config,
            table=table,
            entity_keys=entity_keys,
            requested_features=requested_features,
        )
        # Each row is a set of features for a given entity key
        for row_idx, read_row in enumerate(read_rows):
            row_ts, feature_data = read_row
            result_row = result_rows[row_idx]

            if feature_data is None:
                for feature_name in requested_features:
                    feature_ref = (f"{table.name}__{feature_name}"
                                   if full_feature_names else feature_name)
                    result_row.statuses[
                        feature_ref] = GetOnlineFeaturesResponse.FieldStatus.NOT_FOUND
            else:
                for feature_name in feature_data:
                    feature_ref = (f"{table.name}__{feature_name}"
                                   if full_feature_names else feature_name)
                    if feature_name in requested_features:
                        result_row.fields[feature_ref].CopyFrom(
                            feature_data[feature_name])
                        result_row.statuses[
                            feature_ref] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

    def _get_needed_request_data_features(self, grouped_odfv_refs) -> Set[str]:
        needed_request_data_features = set()
        for odfv_to_feature_names in grouped_odfv_refs:
            odfv, requested_feature_names = odfv_to_feature_names
            odfv_inputs = odfv.inputs.values()
            for odfv_input in odfv_inputs:
                if type(odfv_input) == RequestDataSource:
                    request_data_source = cast(RequestDataSource, odfv_input)
                    for feature_name in request_data_source.schema.keys():
                        needed_request_data_features.add(feature_name)
        return needed_request_data_features

    def _augment_response_with_on_demand_transforms(
        self,
        feature_refs: List[str],
        full_feature_names: bool,
        initial_response: OnlineResponse,
        result_rows: List[GetOnlineFeaturesResponse.FieldValues],
    ) -> OnlineResponse:
        all_on_demand_feature_views = {
            view.name: view
            for view in self._registry.list_on_demand_feature_views(
                project=self.project, allow_cache=True)
        }
        all_odfv_feature_names = all_on_demand_feature_views.keys()

        if len(all_on_demand_feature_views) == 0:
            return initial_response
        initial_response_df = initial_response.to_df()

        odfv_feature_refs = defaultdict(list)
        for feature_ref in feature_refs:
            view_name, feature_name = feature_ref.split(":")
            if view_name in all_odfv_feature_names:
                odfv_feature_refs[view_name].append(feature_name)

        # Apply on demand transformations
        for odfv_name, _feature_refs in odfv_feature_refs.items():
            odfv = all_on_demand_feature_views[odfv_name]
            transformed_features_df = odfv.get_transformed_features_df(
                full_feature_names, initial_response_df)
            for row_idx in range(len(result_rows)):
                result_row = result_rows[row_idx]

                selected_subset = [
                    f for f in transformed_features_df.columns
                    if f in _feature_refs
                ]

                for transformed_feature in selected_subset:
                    transformed_feature_name = (
                        f"{odfv.name}__{transformed_feature}"
                        if full_feature_names else transformed_feature)
                    proto_value = python_value_to_proto_value(
                        transformed_features_df[transformed_feature].
                        values[row_idx])
                    result_row.fields[transformed_feature_name].CopyFrom(
                        proto_value)
                    result_row.statuses[
                        transformed_feature_name] = GetOnlineFeaturesResponse.FieldStatus.PRESENT
        return OnlineResponse(
            GetOnlineFeaturesResponse(field_values=result_rows))

    @log_exceptions_and_usage
    def serve(self, port: int) -> None:
        """Start the feature consumption server locally on a given port."""
        if not flags_helper.enable_python_feature_server(self.config):
            raise ExperimentalFeatureNotEnabled(
                flags.FLAG_PYTHON_FEATURE_SERVER_NAME)

        feature_server.start_server(self, port)
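
Putting the public API together, here is a minimal usage sketch assembled from the docstring examples above; it assumes a local repository at "feature_repo" that defines the driver_hourly_stats feature view, so adjust paths and feature names to your own repo:

from datetime import datetime, timedelta
from feast import FeatureStore

fs = FeatureStore(repo_path="feature_repo")

# Load recent offline data into the online store (see materialize_incremental above).
fs.materialize_incremental(end_date=datetime.utcnow() - timedelta(minutes=5))

# Read the latest values back for a couple of drivers (see get_online_features above).
online_response = fs.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ],
    entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
)
print(online_response.to_dict())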
Example #5
def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
    entity_df_timestamp_range: Tuple[datetime, datetime],
) -> List[FeatureViewQueryContext]:
    """
    Build a query context containing all information required to template a BigQuery and
    Redshift point-in-time SQL query
    """
    (
        feature_views_to_feature_map,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views,
        registry.list_on_demand_feature_views(project))

    query_context = []
    for feature_view, features in feature_views_to_feature_map.items():
        join_keys, entity_selections = [], []
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key)
            join_keys.append(join_key)
            entity_selections.append(f"{entity.join_key} AS {join_key}")

        if isinstance(feature_view.ttl, timedelta):
            ttl_seconds = int(feature_view.ttl.total_seconds())
        else:
            ttl_seconds = 0

        reverse_field_mapping = {
            v: k
            for k, v in feature_view.batch_source.field_mapping.items()
        }
        features = [
            reverse_field_mapping.get(feature, feature) for feature in features
        ]
        timestamp_field = reverse_field_mapping.get(
            feature_view.batch_source.timestamp_field,
            feature_view.batch_source.timestamp_field,
        )
        created_timestamp_column = reverse_field_mapping.get(
            feature_view.batch_source.created_timestamp_column,
            feature_view.batch_source.created_timestamp_column,
        )

        max_event_timestamp = to_naive_utc(
            entity_df_timestamp_range[1]).isoformat()
        min_event_timestamp = None
        if feature_view.ttl:
            min_event_timestamp = to_naive_utc(entity_df_timestamp_range[0] -
                                               feature_view.ttl).isoformat()

        context = FeatureViewQueryContext(
            name=feature_view.projection.name_to_use(),
            ttl=ttl_seconds,
            entities=join_keys,
            features=features,
            field_mapping=feature_view.batch_source.field_mapping,
            event_timestamp_column=timestamp_field,
            created_timestamp_column=created_timestamp_column,
            # TODO: Make created column optional and not hardcoded
            table_subquery=feature_view.batch_source.get_table_query_string(),
            entity_selections=entity_selections,
            min_event_timestamp=min_event_timestamp,
            max_event_timestamp=max_event_timestamp,
        )
        query_context.append(context)

    return query_context
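
Compared to Example #1, this version also derives a scan window from the entity dataframe's timestamp range and the feature view TTL. A small sketch of that arithmetic, with a minimal stand-in for the to_naive_utc helper so the snippet runs on its own (the real helper lives in Feast's utilities):

from datetime import datetime, timedelta, timezone

def to_naive_utc(ts: datetime) -> datetime:
    # Stand-in for Feast's helper: convert to UTC and drop the tzinfo.
    if ts.tzinfo is None:
        return ts
    return ts.astimezone(timezone.utc).replace(tzinfo=None)

entity_df_timestamp_range = (
    datetime(2021, 4, 12, 8, 0, tzinfo=timezone.utc),
    datetime(2021, 4, 12, 12, 0, tzinfo=timezone.utc),
)
ttl = timedelta(hours=2)  # plays the role of feature_view.ttl

max_event_timestamp = to_naive_utc(entity_df_timestamp_range[1]).isoformat()
min_event_timestamp = to_naive_utc(entity_df_timestamp_range[0] - ttl).isoformat()
print(min_event_timestamp, max_event_timestamp)  # 2021-04-12T06:00:00 2021-04-12T12:00:00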
Example #6
    def get_historical_features(
        config: RepoConfig,
        feature_views: List[FeatureView],
        feature_refs: List[str],
        entity_df: Union[pd.DataFrame, str],
        registry: Registry,
        project: str,
        full_feature_names: bool = False,
    ) -> RetrievalJob:
        if not isinstance(entity_df, pd.DataFrame) and not isinstance(
                entity_df, dd.DataFrame):
            raise ValueError(
                f"Please provide an entity_df of type {type(pd.DataFrame)} instead of type {type(entity_df)}"
            )
        entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL  # local modifiable copy of global variable
        if entity_df_event_timestamp_col not in entity_df.columns:
            datetime_columns = entity_df.select_dtypes(
                include=["datetime", "datetimetz"]).columns
            if len(datetime_columns) == 1:
                print(
                    f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
                )
                entity_df_event_timestamp_col = datetime_columns[0]
            else:
                raise ValueError(
                    f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
                )
        (
            feature_views_to_features,
            on_demand_feature_views_to_features,
        ) = _get_requested_feature_views_to_features_dict(
            feature_refs,
            feature_views,
            registry.list_on_demand_feature_views(config.project),
        )

        entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
            entity_df, entity_df_event_timestamp_col)

        # Create a lazy function that is only called from the RetrievalJob object
        def evaluate_historical_retrieval():

            # Create a copy of entity_df to prevent modifying the original
            entity_df_with_features = entity_df.copy()

            entity_df_event_timestamp_col_type = entity_df_with_features.dtypes[
                entity_df_event_timestamp_col]
            if (not hasattr(entity_df_event_timestamp_col_type, "tz")
                    or entity_df_event_timestamp_col_type.tz != pytz.UTC):
                # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
                entity_df_with_features[
                    entity_df_event_timestamp_col] = entity_df_with_features[
                        entity_df_event_timestamp_col].apply(
                            lambda x: x if x.tzinfo is not None else x.replace(
                                tzinfo=pytz.utc))

                # Convert event timestamp column to datetime and normalize time zone to UTC
                # This is necessary to avoid issues with pd.merge_asof
                if isinstance(entity_df_with_features, dd.DataFrame):
                    entity_df_with_features[
                        entity_df_event_timestamp_col] = dd.to_datetime(
                            entity_df_with_features[
                                entity_df_event_timestamp_col],
                            utc=True)
                else:
                    entity_df_with_features[
                        entity_df_event_timestamp_col] = pd.to_datetime(
                            entity_df_with_features[
                                entity_df_event_timestamp_col],
                            utc=True)

            # Sort event timestamp values
            entity_df_with_features = entity_df_with_features.sort_values(
                entity_df_event_timestamp_col)

            join_keys = []
            all_join_keys = []

            # Load feature view data from sources and join them incrementally
            for feature_view, features in feature_views_to_features.items():
                event_timestamp_column = feature_view.batch_source.timestamp_field
                created_timestamp_column = (
                    feature_view.batch_source.created_timestamp_column)

                # Build a list of entity columns to join on (from the right table)
                join_keys = []

                for entity_name in feature_view.entities:
                    entity = registry.get_entity(entity_name, project)
                    join_key = feature_view.projection.join_key_map.get(
                        entity.join_key, entity.join_key)
                    join_keys.append(join_key)

                right_entity_key_columns = [
                    event_timestamp_column,
                    created_timestamp_column,
                ] + join_keys
                right_entity_key_columns = [
                    c for c in right_entity_key_columns if c
                ]

                all_join_keys = list(set(all_join_keys + join_keys))

                df_to_join = _read_datasource(feature_view.batch_source)

                df_to_join, event_timestamp_column = _field_mapping(
                    df_to_join,
                    feature_view,
                    features,
                    right_entity_key_columns,
                    entity_df_event_timestamp_col,
                    event_timestamp_column,
                    full_feature_names,
                )

                df_to_join = _merge(entity_df_with_features, df_to_join,
                                    join_keys)

                df_to_join = _normalize_timestamp(df_to_join,
                                                  event_timestamp_column,
                                                  created_timestamp_column)

                df_to_join = _filter_ttl(
                    df_to_join,
                    feature_view,
                    entity_df_event_timestamp_col,
                    event_timestamp_column,
                )

                df_to_join = _drop_duplicates(
                    df_to_join,
                    all_join_keys,
                    event_timestamp_column,
                    created_timestamp_column,
                    entity_df_event_timestamp_col,
                )

                entity_df_with_features = _drop_columns(
                    df_to_join, event_timestamp_column,
                    created_timestamp_column)

                # Ensure that we delete dataframes to free up memory
                del df_to_join

            return entity_df_with_features.persist()

        job = FileRetrievalJob(
            evaluation_function=evaluate_historical_retrieval,
            full_feature_names=full_feature_names,
            on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
                feature_refs, project, registry),
            metadata=RetrievalMetadata(
                features=feature_refs,
                keys=list(
                    set(entity_df.columns) - {entity_df_event_timestamp_col}),
                min_event_timestamp=entity_df_event_timestamp_range[0],
                max_event_timestamp=entity_df_event_timestamp_range[1],
            ),
        )
        return job
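
This variant also accepts dask DataFrames. A short, self-contained sketch of the dask-aware timestamp normalization used above, applied to a toy entity dataframe (column names are illustrative):

import dask.dataframe as dd
import pandas as pd
import pytz

entity_df = dd.from_pandas(
    pd.DataFrame({
        "driver_id": [1001, 1002],
        "event_timestamp": [pd.Timestamp("2021-04-12 10:59:42"),
                            pd.Timestamp("2021-04-12 08:12:10")],  # tz-naive on purpose
    }),
    npartitions=1,
)

col_type = entity_df.dtypes["event_timestamp"]
if not hasattr(col_type, "tz") or col_type.tz != pytz.UTC:
    # Normalize to tz-aware UTC so downstream point-in-time joins line up.
    entity_df["event_timestamp"] = dd.to_datetime(entity_df["event_timestamp"], utc=True)

print(entity_df.compute())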