Code example #1
    def get_online_features(
        self,
        feature_refs: List[str],
        entity_rows: List[Dict[str, Any]],
        project: Optional[str] = None,
    ) -> OnlineResponse:
        """
        Retrieves the latest online feature data from Feast Serving.
        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
                Only the feature name is required.
            entity_rows: A list of dictionaries where each key-value is an entity-name, entity-value pair.
            project: Optionally specify a project override. If specified, the given
                project is used for retrieval and overrides any project set in the
                feature references.
        Returns:
            An OnlineResponse containing the feature data as records.
            Each entity row provided yields one record, which contains
            field values and field status metadata (if included).
        Examples:
            >>> from feast import Client
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["sales:daily_transactions"]
            >>> entity_rows = [{"customer_id": 0},{"customer_id": 1}]
            >>>
            >>> online_response = feast_client.get_online_features(
            >>>     feature_refs, entity_rows, project="my_project")
            >>> online_response_dict = online_response.to_dict()
            >>> print(online_response_dict)
            {'sales:daily_transactions': [1.1,1.2], 'sales:customer_id': [0,1]}
        """

        if self._telemetry_enabled:
            if self._telemetry_counter["get_online_features"] % 1000 == 0:
                log_usage(
                    "get_online_features",
                    self._telemetry_id,
                    datetime.utcnow(),
                    self.version(sdk_only=True),
                )
            self._telemetry_counter["get_online_features"] += 1
        try:
            response = self._serving_service.GetOnlineFeaturesV2(
                GetOnlineFeaturesRequestV2(
                    features=_build_feature_references(
                        feature_ref_strs=feature_refs),
                    entity_rows=_infer_online_entity_rows(entity_rows),
                    project=project if project is not None else self.project,
                ),
                timeout=self._config.getint(opt.GRPC_CONNECTION_TIMEOUT),
                metadata=self._get_grpc_metadata(),
            )
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        response = OnlineResponse(response)
        return response
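Because to_dict() returns one column per feature reference (plus the entity columns), the response maps directly onto a Pandas DataFrame. A minimal sketch, assuming the host/port values above are placeholders and that the "sales:daily_transactions" feature has been registered:

    import pandas as pd
    from feast import Client

    client = Client(core_url="localhost:6565", serving_url="localhost:6566")

    # Fetch the latest values for two customers; the order of the returned
    # lists matches the order of the entity rows passed in.
    response = client.get_online_features(
        feature_refs=["sales:daily_transactions"],
        entity_rows=[{"customer_id": 0}, {"customer_id": 1}],
    )

    # to_dict() yields {feature_ref: [value, ...]}, column-oriented data
    # that loads straight into a DataFrame for scoring.
    df = pd.DataFrame(response.to_dict())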
Code example #2
    def get_feature_table(self,
                          name: str,
                          project: Optional[str] = None) -> FeatureTable:
        """
        Retrieves a feature table.

        Args:
            name: Name of feature table
            project: Feast project that this feature table belongs to

        Returns:
            The specified feature table, or raises an exception if
            none is found
        """

        if self._telemetry_enabled:
            log_usage(
                "get_feature_table",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project

        try:
            get_feature_table_response = self._core_service.GetFeatureTable(
                GetFeatureTableRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetFeatureTableResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        return FeatureTable.from_proto(get_feature_table_response.table)
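A short usage sketch (the table and project names are placeholders); the returned FeatureTable exposes the registered schema, which is useful to check before ingesting:

    from feast import Client

    client = Client(core_url="localhost:6565")

    # Retrieval raises grpc.RpcError if the table does not exist, so there
    # is no need to check the return value for None.
    driver_ft = client.get_feature_table("driver_statistics", project="my_project")
    print(driver_ft.name)
    print([feature.name for feature in driver_ft.features])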
Code example #3
    def get_entity(self, name: str, project: Optional[str] = None) -> Entity:
        """
        Retrieves an entity.

        Args:
            name: Name of entity
            project: Feast project that this entity belongs to

        Returns:
            The specified entity, or raises an exception if
            none is found
        """

        if self._telemetry_enabled:
            log_usage(
                "get_entity",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project

        try:
            get_entity_response = self._core_service.GetEntity(
                GetEntityRequest(project=project, name=name.strip()),
                metadata=self._get_grpc_metadata(),
            )  # type: GetEntityResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        entity = Entity.from_proto(get_entity_response.entity)

        return entity
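The same pattern works for entities. A minimal sketch, assuming the entity name and project below are placeholders and that the Entity wrapper exposes name and value_type properties:

    from feast import Client

    client = Client(core_url="localhost:6565")

    # Leading/trailing whitespace in the name is stripped before lookup.
    driver = client.get_entity("driver_id", project="my_project")
    print(driver.name, driver.value_type)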
Code example #4
    def apply(
        self,
        objects: Union[List[Union[Entity, FeatureTable]], Entity,
                       FeatureTable],
        project: Optional[str] = None,
    ):
        """
        Idempotently registers entities and feature tables with Feast Core. A single
        entity or feature table, or a list of them, can be provided.

        Args:
            objects: List of entities and/or feature tables that will be registered
            project: Feast project to register the objects under. Defaults to the
                client's current project if not specified.

        Examples:
            >>> from feast import Client
            >>> from feast.entity import Entity
            >>> from feast.value_type import ValueType
            >>>
            >>> feast_client = Client(core_url="localhost:6565")
            >>> entity = Entity(
            >>>     name="driver_entity",
            >>>     description="Driver entity for car rides",
            >>>     value_type=ValueType.STRING,
            >>>     labels={
            >>>         "key": "val"
            >>>     }
            >>> )
            >>> feast_client.apply(entity)
        """

        if self._telemetry_enabled:
            log_usage(
                "apply",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project

        if not isinstance(objects, list):
            objects = [objects]
        for obj in objects:
            if isinstance(obj, Entity):
                self._apply_entity(project, obj)  # type: ignore
            elif isinstance(obj, FeatureTable):
                self._apply_feature_table(project, obj)  # type: ignore
            else:
                raise ValueError(
                    f"Could not determine object type to apply {obj} with type {type(obj)}. Type must be Entity or FeatureTable."
                )
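The docstring example registers a single entity; apply also accepts a mixed list, which is how an entity and the feature table that references it are usually registered together. A sketch in which the names, schema, and local file URL are placeholders:

    from feast import Client
    from feast.data_format import ParquetFormat
    from feast.data_source import FileSource
    from feast.entity import Entity
    from feast.feature import Feature
    from feast.feature_table import FeatureTable
    from feast.value_type import ValueType

    client = Client(core_url="localhost:6565")

    driver = Entity(
        name="driver_id",
        description="Driver identifier",
        value_type=ValueType.INT64,
    )

    driver_stats = FeatureTable(
        name="driver_statistics",
        entities=["driver_id"],
        features=[Feature("avg_rating", ValueType.FLOAT)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="file:///tmp/driver_statistics",
        ),
    )

    # Registration is idempotent: re-applying unchanged objects is a no-op.
    client.apply([driver, driver_stats])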
Code example #5
    def ingest(
        self,
        feature_table: Union[str, FeatureTable],
        source: Union[pd.DataFrame, str],
        project: Optional[str] = None,
        chunk_size: int = 10000,
        max_workers: int = max(CPU_COUNT - 1, 1),
        timeout: int = int(opt().BATCH_INGESTION_PRODUCTION_TIMEOUT),
    ) -> None:
        """
        Batch load feature data into a FeatureTable.

        Args:
            feature_table (typing.Union[str, feast.feature_table.FeatureTable]):
                FeatureTable object or the string name of the feature table

            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json

            project: Feast project to locate FeatureTable

            chunk_size (int):
                Amount of rows to load and ingest at a time.

            max_workers (int):
                Number of worker processes to use to encode values.

            timeout (int):
                Timeout in seconds to wait for completion.

        Examples:
            >>> from datetime import datetime
            >>> import pandas as pd
            >>> from feast import Client
            >>>
            >>> client = Client(core_url="localhost:6565")
            >>> ft_df = pd.DataFrame(
            >>>         {
            >>>            "datetime": [pd.datetime.now()],
            >>>            "driver": [1001],
            >>>            "rating": [4.3],
            >>>         }
            >>>     )
            >>> client.set_project("project1")
            >>>
            >>> driver_ft = client.get_feature_table("driver")
            >>> client.ingest(driver_ft, ft_df)
        """

        if self._telemetry_enabled:
            log_usage(
                "ingest",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        if project is None:
            project = self.project
        if isinstance(feature_table, str):
            name = feature_table
        elif isinstance(feature_table, FeatureTable):
            name = feature_table.name

        fetched_feature_table: Optional[FeatureTable] = self.get_feature_table(
            name, project)
        if fetched_feature_table is not None:
            feature_table = fetched_feature_table
        else:
            raise Exception(f"FeatureTable, {name} cannot be found.")

        # Check 1) Only parquet file format for FeatureTable batch source is supported
        if (feature_table.batch_source
                and issubclass(type(feature_table.batch_source), FileSource)
                and not isinstance(
                    feature_table.batch_source.file_options.file_format,
                    ParquetFormat)):
            raise Exception(
                f"No suitable batch source found for FeatureTable, {name}. "
                f"Only BATCH_FILE source with parquet format is supported for batch ingestion."
            )

        pyarrow_table, column_names = _read_table_from_source(source)
        # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table
        _check_field_mappings(
            column_names,
            name,
            feature_table.batch_source.event_timestamp_column,
            feature_table.batch_source.field_mapping,
        )

        dir_path = None
        with_partitions = False
        if (issubclass(type(feature_table.batch_source), FileSource)
                and feature_table.batch_source.date_partition_column):
            with_partitions = True
            dest_path = _write_partitioned_table_from_source(
                column_names,
                pyarrow_table,
                feature_table.batch_source.date_partition_column,
                feature_table.batch_source.event_timestamp_column,
            )
        else:
            dir_path, dest_path = _write_non_partitioned_table_from_source(
                column_names,
                pyarrow_table,
                chunk_size,
                max_workers,
            )

        try:
            if issubclass(type(feature_table.batch_source), FileSource):
                file_url = feature_table.batch_source.file_options.file_url.rstrip(
                    "*")
                _upload_to_file_source(file_url, with_partitions, dest_path,
                                       self._config)
            if issubclass(type(feature_table.batch_source), BigQuerySource):
                bq_table_ref = feature_table.batch_source.bigquery_options.table_ref
                feature_table_timestamp_column = (
                    feature_table.batch_source.event_timestamp_column)

                _upload_to_bq_source(bq_table_ref,
                                     feature_table_timestamp_column, dest_path)
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            if dir_path:
                shutil.rmtree(dir_path)

        print(
            "Data has been successfully ingested into FeatureTable batch source."
        )
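source can also be a file path instead of a DataFrame. A minimal sketch, with the project, table, and path below as placeholders, assuming the table was registered with a parquet-format FileSource batch source:

    from feast import Client

    client = Client(core_url="localhost:6565")
    client.set_project("my_project")

    driver_ft = client.get_feature_table("driver_statistics")

    # The file is read into a pyarrow table, split into chunks of
    # `chunk_size` rows, encoded in parallel by the worker processes, and
    # uploaded to the table's batch source.
    client.ingest(driver_ft, "/tmp/driver_statistics.parquet", chunk_size=5000)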
Code example #6
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        output_location: Optional[str] = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a Pandas DataFrame, the dataframe will be staged
                so that it is accessible to the Spark workers.
                If any feature table's source is a BigQuerySource, the entities are
                uploaded to BigQuery; otherwise they are staged to remote file storage
                (derived from the configured staging location).
                It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a Pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            output_location: Specifies the path in a bucket to write the exported feature data files

        Returns:
            A retrieval job object that can be used to monitor retrieval
            progress asynchronously, and can be used to materialize the
            results.

        Examples:
            >>> from feast import Client
            >>> from feast.data_format import ParquetFormat
            >>> from feast.data_source import FileSource
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source)
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        if self._telemetry_enabled:
            log_usage(
                "get_historical_features",
                self._telemetry_id,
                datetime.utcnow(),
                self.version(sdk_only=True),
            )
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, self.project)

        assert all(
            ft.batch_source.created_timestamp_column
            for ft in feature_tables), (
                "All BatchSources attached to retrieved FeatureTables "
                "must have specified `created_timestamp_column` to be used in "
                "historical dataset generation.")

        if output_location is None:
            output_location = os.path.join(
                self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_LOCATION),
                str(uuid.uuid4()),
            )
        output_format = self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_FORMAT)
        feature_sources = [
            feature_table.batch_source for feature_table in feature_tables
        ]

        if isinstance(entity_source, pd.DataFrame):
            if any(
                    isinstance(source, BigQuerySource)
                    for source in feature_sources):
                first_bq_source = [
                    source for source in feature_sources
                    if isinstance(source, BigQuerySource)
                ][0]
                source_ref = table_reference_from_string(
                    first_bq_source.bigquery_options.table_ref)
                entity_source = stage_entities_to_bq(entity_source,
                                                     source_ref.project,
                                                     source_ref.dataset_id)
            else:
                entity_source = stage_entities_to_fs(
                    entity_source,
                    staging_location=self._config.get(
                        opt.SPARK_STAGING_LOCATION),
                    config=self._config,
                )

        if self._use_job_service:
            response = self._job_service.GetHistoricalFeatures(
                GetHistoricalFeaturesRequest(
                    feature_refs=feature_refs,
                    entity_source=entity_source.to_proto(),
                    project=self.project,
                    output_format=output_format,
                    output_location=output_location,
                ),
                **self._extra_grpc_params(),
            )
            return RemoteRetrievalJob(
                self._job_service,
                self._extra_grpc_params,
                response.id,
                output_file_uri=response.output_file_uri,
                start_time=response.job_start_time.ToDatetime(),
                log_uri=response.log_uri,
            )
        else:
            return start_historical_feature_retrieval_job(
                client=self,
                project=self.project,
                entity_source=entity_source,
                feature_tables=feature_tables,
                output_format=output_format,
                output_path=output_location,
            )
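Once the job finishes, the exported files can be read back with pandas. A sketch, assuming get_output_file_uri() blocks until the Spark job completes (as the docstring's example implies) and that the bucket paths below are placeholders:

    import pandas as pd
    from feast import Client
    from feast.data_format import ParquetFormat
    from feast.data_source import FileSource

    client = Client(core_url="localhost:6565")

    entity_source = FileSource(
        "event_timestamp", ParquetFormat(), "gs://some-bucket/customer")

    job = client.get_historical_features(
        ["bookings:bookings_7d"], entity_source)

    # Returns the bucket path that holds the exported feature files.
    output_uri = job.get_output_file_uri()

    # Reading a remote parquet directory requires the matching fsspec
    # backend (gcsfs for gs://, s3fs for s3://).
    df = pd.read_parquet(output_uri)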