Example #1
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.
    """
    def __init__(self, options: Optional[Dict[str, str]] = None, **kwargs):
        """
        The Feast Client should be initialized with at least one service URL

        Args:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            project: Sets the active project. This field is optional.
            core_secure: Use client-side SSL/TLS for Core gRPC API
            serving_secure: Use client-side SSL/TLS for Serving gRPC API
            options: Configuration options to initialize client with
            **kwargs: Additional keyword arguments that will be used as
                configuration options along with "options"
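
        Example (a minimal sketch; the kwargs shown are the documented
        configuration options above):
            >>> client = Client(core_url="localhost:6565",
            >>>                 serving_url="localhost:6566")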
        """

        if options is None:
            options = dict()
        self._config = Config(options={**options, **kwargs})

        self.__core_channel: grpc.Channel = None
        self.__serving_channel: grpc.Channel = None
        self._core_service_stub: CoreServiceStub = None
        self._serving_service_stub: ServingServiceStub = None

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL

        Returns:
            Feast Core URL string
        """
        return self._config.get(CONFIG_CORE_URL_KEY)

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL

        Args:
            value: Feast Core URL
        """
        self._config.set(CONFIG_CORE_URL_KEY, value)

    @property
    def serving_url(self) -> str:
        """
        Retrieve Feast Serving URL

        Returns:
            Feast Serving URL string
        """
        return self._config.get(CONFIG_SERVING_URL_KEY)

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL

        Args:
            value: Feast Serving URL
        """
        self._config.set(CONFIG_SERVING_URL_KEY, value)

    @property
    def core_secure(self) -> bool:
        """
        Retrieve Feast Core client-side SSL/TLS setting

        Returns:
            Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(CONFIG_CORE_SECURE_KEY)

    @core_secure.setter
    def core_secure(self, value: bool):
        """
        Set the Feast Core client-side SSL/TLS setting

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(CONFIG_CORE_SECURE_KEY, value)

    @property
    def serving_secure(self) -> bool:
        """
        Retrieve Feast Serving client-side SSL/TLS setting

        Returns:
            Whether client-side SSL/TLS is enabled
        """
        return self._config.getboolean(CONFIG_SERVING_SECURE_KEY)

    @serving_secure.setter
    def serving_secure(self, value: bool):
        """
        Set the Feast Serving client-side SSL/TLS setting

        Args:
            value: True to enable client-side SSL/TLS
        """
        self._config.set(CONFIG_SERVING_SECURE_KEY, value)

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving
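
        The returned dictionary has one entry per configured service, with
        the shape (taken from the code below):
            {"core": {"url": ..., "version": ...},
             "serving": {"url": ..., "version": ...}}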
        """
        result = {}

        if self.serving_url:
            self._connect_serving()
            serving_version = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            ).version
            result["serving"] = {
                "url": self.serving_url,
                "version": serving_version
            }

        if self.core_url:
            self._connect_core()
            core_version = self._core_service_stub.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            ).version
            result["core"] = {"url": self.core_url, "version": core_version}

        return result

    def _connect_core(self, skip_if_connected: bool = True):
        """
        Connect to Core API

        Args:
            skip_if_connected: Do not attempt to connect if already connected
        """
        if skip_if_connected and self._core_service_stub:
            return

        if not self.core_url:
            raise ValueError("Please set Feast Core URL.")

        if self.__core_channel is None:
            if self.core_secure or self.core_url.endswith(":443"):
                self.__core_channel = grpc.secure_channel(
                    self.core_url, grpc.ssl_channel_credentials())
            else:
                self.__core_channel = grpc.insecure_channel(self.core_url)

        try:
            grpc.channel_ready_future(
                self.__core_channel).result(timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY))
        except grpc.FutureTimeoutError:
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Core gRPC server {self.core_url} ")
        else:
            self._core_service_stub = CoreServiceStub(self.__core_channel)

    def _connect_serving(self, skip_if_connected=True):
        """
        Connect to Serving API

        Args:
            skip_if_connected: Do not attempt to connect if already connected
        """

        if skip_if_connected and self._serving_service_stub:
            return

        if not self.serving_url:
            raise ValueError("Please set Feast Serving URL.")

        if self.__serving_channel is None:
            if self.serving_secure or self.serving_url.endswith(":443"):
                self.__serving_channel = grpc.secure_channel(
                    self.serving_url, grpc.ssl_channel_credentials())
            else:
                self.__serving_channel = grpc.insecure_channel(
                    self.serving_url)

        try:
            grpc.channel_ready_future(
                self.__serving_channel).result(timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY))
        except grpc.FutureTimeoutError:
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Serving gRPC server {self.serving_url} ")
        else:
            self._serving_service_stub = ServingServiceStub(
                self.__serving_channel)

    @property
    def project(self) -> Union[str, None]:
        """
        Retrieve currently active project

        Returns:
            Project name
        """
        return self._config.get(CONFIG_PROJECT_KEY)

    def set_project(self, project: str):
        """
        Set currently active Feast project

        Args:
            project: Project to set as active
        """
        self._config.set(CONFIG_PROJECT_KEY, project)

    def list_projects(self) -> List[str]:
        """
        List all active Feast projects

        Returns:
            List of project names
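
        Example (sketch, assuming a reachable Feast Core):
            >>> client = Client(core_url="localhost:6565")
            >>> project_names = client.list_projects()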

        """
        self._connect_core()
        response = self._core_service_stub.ListProjects(
            ListProjectsRequest(),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
        )  # type: ListProjectsResponse
        return list(response.projects)

    def create_project(self, project: str):
        """
        Creates a Feast project

        Args:
            project: Name of project
        """

        self._connect_core()
        self._core_service_stub.CreateProject(
            CreateProjectRequest(name=project),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
        )  # type: CreateProjectResponse

    def archive_project(self, project):
        """
        Archives a project. The project will continue to function for
        ingestion and retrieval, but will be in a read-only state. It will
        also no longer be visible from the Core API for management purposes.

        Args:
            project: Name of project to archive
        """

        self._connect_core()
        self._core_service_stub.ArchiveProject(
            ArchiveProjectRequest(name=project),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
        )  # type: ArchiveProjectResponse

        if self.project == project:
            self._config.set(CONFIG_PROJECT_KEY, "")

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single
        feature set or a list can be provided.

        Args:
            feature_sets: List of feature sets that will be registered
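
        Example (illustrative sketch; the FeatureSet constructor shown is an
        assumption and may differ between SDK versions):
            >>> fs = FeatureSet("customer_transactions")
            >>> client.apply(fs)    # single feature set
            >>> client.apply([fs])  # or a list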
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]
        for feature_set in feature_sets:
            if isinstance(feature_set, FeatureSet):
                self._apply_feature_set(feature_set)
                continue
            raise ValueError(
                f"Could not determine feature set type to apply {feature_set}")

    def _apply_feature_set(self, feature_set: FeatureSet):
        """
        Registers a single feature set with Feast

        Args:
            feature_set: Feature set that will be registered
        """
        self._connect_core()

        feature_set.is_valid()
        feature_set_proto = feature_set.to_proto()
        if len(feature_set_proto.spec.project) == 0:
            if self.project is None:
                raise ValueError(
                    f"No project found in feature set {feature_set.name}. "
                    f"Please set the project within the feature set or within "
                    f"your Feast Client.")
            else:
                feature_set_proto.spec.project = self.project

        # Convert the feature set to a request and send to Feast Core
        try:
            apply_fs_response = self._core_service_stub.ApplyFeatureSet(
                ApplyFeatureSetRequest(feature_set=feature_set_proto),
                timeout=self._config.getint(
                    CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
            )  # type: ApplyFeatureSetResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

        # Extract the returned feature set
        applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

        # If the feature set has changed, update the local copy
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
            print(
                f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"'
            )

        # If no change has been applied, do nothing
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
            print(f"No change detected or applied: {feature_set.name}")

        # Deep copy from the returned feature set to the local feature set
        feature_set._update_from_feature_set(applied_fs)

    def list_feature_sets(self,
                          project: str = None,
                          name: str = None,
                          version: str = None) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core

        Args:
            project: Filter feature sets based on project name
            name: Filter feature sets based on feature set name
            version: Filter feature sets based on version number

        Returns:
            List of feature sets
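
        Example (sketch; omitted filters default to the "*" wildcard):
            >>> feature_sets = client.list_feature_sets(project="my_project")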
        """
        self._connect_core()

        if project is None:
            if self.project is not None:
                project = self.project
            else:
                project = "*"

        if name is None:
            name = "*"

        if version is None:
            version = "*"

        filter = ListFeatureSetsRequest.Filter(project=project,
                                               feature_set_name=name,
                                               feature_set_version=version)

        # Get latest feature sets from Feast Core
        feature_set_protos = self._core_service_stub.ListFeatureSets(
            ListFeatureSetsRequest(
                filter=filter))  # type: ListFeatureSetsResponse

        # Extract feature sets and return
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(self,
                        name: str,
                        version: int = None,
                        project: str = None) -> Union[FeatureSet, None]:
        """
        Retrieves a feature set. If no version is specified then the latest
        version will be returned.

        Args:
            project: Feast project that this feature set belongs to
            name: Name of feature set
            version: Version of feature set

        Returns:
            Returns either the specified feature set, or raises an exception if
            none is found
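
        Example (sketch; names are hypothetical):
            >>> latest = client.get_feature_set("customer_transactions")
            >>> pinned = client.get_feature_set("customer_transactions",
            >>>                                 version=2)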
        """
        self._connect_core()

        if project is None:
            if self.project is not None:
                project = self.project
            else:
                raise ValueError("No project has been configured.")

        if version is None:
            version = 0

        try:
            get_feature_set_response = self._core_service_stub.GetFeatureSet(
                GetFeatureSetRequest(
                    project=project, name=name.strip(),
                    version=int(version)))  # type: GetFeatureSetResponse
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())
        return FeatureSet.from_proto(get_feature_set_response.feature_set)

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets

        Returns:
            Dictionary of entities, indexed by name
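
        Example (sketch):
            >>> for name, entity in client.list_entities().items():
            >>>     print(name)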
        """
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(
        self,
        feature_refs: List[str],
        entity_rows: Union[pd.DataFrame, str],
        default_project: str = None,
    ) -> RetrievalJob:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_refs (List[str]):
                List of feature references that will be returned for each entity.
                Each feature reference should have the following format
                "project/feature:version".

            entity_rows (Union[pd.DataFrame, str]):
                Pandas dataframe containing entities and a 'datetime' column,
                or a path to an Avro file (wildcard paths are also accepted).
                Each entity in a feature set must be present as a column in
                this dataframe. The datetime column must contain timestamps in
                datetime64 format.
            default_project: Default project where feature values will be found.

        Returns:
            feast.job.RetrievalJob:
                Returns a retrieval job object that can be used to monitor retrieval
                progress asynchronously, and can be used to materialize the
                results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> import pandas as pd
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["my_project/bookings_7d:1", "booking_14d"]
            >>> entity_rows = pd.DataFrame(
            >>>         {
            >>>            "datetime": [datetime.now() for _ in range(3)],
            >>>            "customer": [1001, 1002, 1003],
            >>>         }
            >>>     )
            >>> feature_retrieval_job = feast_client.get_batch_features(
            >>>     feature_refs, entity_rows, default_project="my_project")
            >>> df = feature_retrieval_job.to_dataframe()
            >>> print(df)
        """

        self._connect_serving()

        feature_references = _build_feature_references(
            feature_refs=feature_refs, default_project=default_project)

        # Retrieve serving information to determine store type and
        # staging location
        serving_info = self._serving_service_stub.GetFeastServingInfo(
            GetFeastServingInfoRequest(),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
        )  # type: GetFeastServingInfoResponse

        if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
            raise Exception(
                f'You are connected to a store "{self.serving_url}" which '
                f"does not support batch retrieval ")

        if isinstance(entity_rows, pd.DataFrame):
            # Pandas DataFrame detected

            # Remove timezone from datetime column
            if isinstance(entity_rows["datetime"].dtype,
                          pd.core.dtypes.dtypes.DatetimeTZDtype):
                entity_rows["datetime"] = pd.DatetimeIndex(
                    entity_rows["datetime"]).tz_localize(None)
        elif isinstance(entity_rows, str):
            # String based source
            if not entity_rows.endswith((".avro", "*")):
                raise Exception(
                    "Only .avro and wildcard paths are accepted as entity_rows"
                )
        else:
            raise Exception(f"Only pandas.DataFrame and str types are allowed"
                            f" as entity_rows, but got {type(entity_rows)}.")

        # Export and upload entity row DataFrame to staging location
        # provided by Feast
        staged_files = export_source_to_staging_location(
            entity_rows, serving_info.job_staging_location)  # type: List[str]

        request = GetBatchFeaturesRequest(
            features=feature_references,
            dataset_source=DatasetSource(file_source=DatasetSource.FileSource(
                file_uris=staged_files,
                data_format=DataFormat.DATA_FORMAT_AVRO)),
        )

        # Retrieve Feast Job object to manage life cycle of retrieval
        response = self._serving_service_stub.GetBatchFeatures(request)
        return RetrievalJob(response.job, self._serving_service_stub)

    def get_online_features(
        self,
        feature_refs: List[str],
        entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
        default_project: Optional[str] = None,
    ) -> GetOnlineFeaturesResponse:
        """
        Retrieves the latest online feature data from Feast Serving

        Args:
            feature_refs: List of feature references in the following format
                [project]/[feature_name]:[version]. Only the feature name
                is a required component in the reference.
                example:
                    ["my_project/my_feature_1:3",
                    "my_project3/my_feature_4:1",]
            entity_rows: List of GetOnlineFeaturesRequest.EntityRow where each
                row contains entities. Timestamp should not be set for online
                retrieval. All entity types within a feature set must be
                provided.
            default_project: This project will be used if the project name is
                not provided in the feature reference

        Returns:
            Returns a list of maps where each item in the list contains the
            latest feature values for the provided entities
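
        Example (illustrative sketch; the EntityRow/Value construction shown
        assumes the protobuf types shipped with this SDK, and names are
        hypothetical):
            >>> from feast.types.Value_pb2 import Value
            >>> entity_rows = [
            >>>     GetOnlineFeaturesRequest.EntityRow(
            >>>         fields={"customer_id": Value(int64_val=1001)})
            >>> ]
            >>> response = client.get_online_features(
            >>>     feature_refs=["my_project/my_feature_1:3"],
            >>>     entity_rows=entity_rows)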
        """
        self._connect_serving()

        return self._serving_service_stub.GetOnlineFeatures(
            GetOnlineFeaturesRequest(
                features=_build_feature_references(
                    feature_refs=feature_refs,
                    default_project=(default_project
                                     if default_project else self.project),
                ),
                entity_rows=entity_rows,
            ))

    def list_ingest_jobs(
        self,
        job_id: str = None,
        feature_set_ref: FeatureSetRef = None,
        store_name: str = None,
    ):
        """
        List the ingestion jobs currently registered in Feast, with optional filters.
        Provides detailed metadata about each ingestion job.

        Args:
            job_id: Select specific ingestion job with the given job_id
            feature_set_ref: Filter ingestion jobs by target feature set (via reference)
            store_name: Filter ingestion jobs by target feast store's name

        Returns:
            List of IngestJobs matching the given filters
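
        Example (sketch; all filters are optional and may be combined, and
        the store name is hypothetical):
            >>> jobs = client.list_ingest_jobs(store_name="historical")
            >>> for job in jobs:
            >>>     print(job.id)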
        """
        self._connect_core()
        # construct list request
        feature_set_ref_proto = None
        if feature_set_ref is not None:
            feature_set_ref_proto = feature_set_ref.to_proto()
        list_filter = ListIngestionJobsRequest.Filter(
            id=job_id,
            feature_set_reference=feature_set_ref_proto,
            store_name=store_name,
        )
        request = ListIngestionJobsRequest(filter=list_filter)
        # make list request & unpack response
        response = self._core_service_stub.ListIngestionJobs(request)
        ingest_jobs = [
            IngestJob(proto, self._core_service_stub)
            for proto in response.jobs
        ]
        return ingest_jobs

    def restart_ingest_job(self, job: IngestJob):
        """
        Restart an ingestion job currently registered with Feast.
        NOTE: Data might be lost during the restart for some job runners.
        Does not support restarting a job in a transitional (i.e. pending,
        suspending, aborting), terminal (i.e. suspended or aborted), or
        unknown state.

        Args:
            job: IngestJob to restart
        """
        self._connect_core()
        request = RestartIngestionJobRequest(id=job.id)
        try:
            self._core_service_stub.RestartIngestionJob(request)
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

    def stop_ingest_job(self, job: IngestJob):
        """
        Stop an ingestion job currently registered with Feast.
        Does nothing if the target job is already in a terminal state
        (i.e. suspended or aborted). Does not support stopping a job in a
        transitional (i.e. pending, suspending, aborting) or unknown state.

        Args:
            job: IngestJob to stop
        """
        self._connect_core()
        request = StopIngestionJobRequest(id=job.id)
        try:
            self._core_service_stub.StopIngestionJob(request)
        except grpc.RpcError as e:
            raise grpc.RpcError(e.details())

    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        source: Union[pd.DataFrame, str],
        chunk_size: int = 10000,
        version: int = None,
        force_update: bool = False,
        max_workers: int = max(CPU_COUNT - 1, 1),
        disable_progress_bar: bool = False,
        timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT,
    ) -> None:
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set (typing.Union[str, feast.feature_set.FeatureSet]):
                Feature set object or the string name of the feature set
                (without a version).

            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json

            chunk_size (int):
                Amount of rows to load and ingest at a time.

            version (int):
                Feature set version.

            force_update (bool):
                Automatically update feature set based on source data prior to
                ingesting. This will also register changes to Feast.

            max_workers (int):
                Number of worker processes to use to encode values.

            disable_progress_bar (bool):
                Disable printing of progress statistics.

            timeout (int):
                Timeout in seconds to wait for completion.

        Returns:
            None:
                None
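
        Example (illustrative sketch, assuming a feature set registered with
        a Kafka source, which is the only source type supported below; all
        names are hypothetical):
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            >>>     "datetime": [pd.Timestamp.now()],
            >>>     "customer_id": [1001],
            >>>     "bookings_7d": [5],
            >>> })
            >>> client.ingest("customer_bookings", df)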
        """

        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            raise Exception("Feature set name must be provided")

        # Read table and get row count
        dir_path, dest_path = _read_table_from_source(source, chunk_size,
                                                      max_workers)

        pq_file = pq.ParquetFile(dest_path)

        row_count = pq_file.metadata.num_rows

        # Update the feature set based on PyArrow table of first row group
        if force_update:
            feature_set.infer_fields_from_pa(
                table=pq_file.read_row_group(0),
                discard_unused_fields=True,
                replace_existing_features=True,
            )
            self.apply(feature_set)
        current_time = time.time()

        print("Waiting for feature set to be ready for ingestion...")
        while True:
            if timeout is not None and time.time() - current_time >= timeout:
                raise TimeoutError(
                    "Timed out waiting for feature set to be ready")
            feature_set = self.get_feature_set(name, version)
            if (feature_set is not None
                    and feature_set.status == FeatureSetStatus.STATUS_READY):
                break
            time.sleep(3)

        if timeout is not None:
            timeout = timeout - int(time.time() - current_time)

        try:
            # Kafka configs
            brokers = feature_set.get_kafka_source_brokers()
            topic = feature_set.get_kafka_source_topic()
            producer = get_producer(brokers, row_count, disable_progress_bar)

            # Loop optimization declarations
            produce = producer.produce
            flush = producer.flush

            # Transform and push data to Kafka
            if feature_set.source.source_type == "Kafka":
                for chunk in get_feature_row_chunks(
                        file=dest_path,
                        row_groups=list(range(pq_file.num_row_groups)),
                        fs=feature_set,
                        max_workers=max_workers,
                ):

                    # Push FeatureRow one chunk at a time to kafka
                    for serialized_row in chunk:
                        produce(topic=topic, value=serialized_row)

                    # Force a flush after each chunk
                    flush(timeout=timeout)

                    # Remove chunk from memory
                    del chunk

            else:
                raise Exception(
                    f"Could not determine source type for feature set "
                    f'"{feature_set.name}" with source type '
                    f'"{feature_set.source.source_type}"')

            # Print ingestion statistics
            producer.print_results()
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            shutil.rmtree(dir_path)

        return None
Example #2
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.
    """
    def __init__(self,
                 core_url: str = None,
                 serving_url: str = None,
                 verbose: bool = False):
        """
        The Feast Client should be initialized with at least one service URL

        Args:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            verbose: Enable verbose logging
        """
        self._core_url = core_url
        self._serving_url = serving_url
        self._verbose = verbose
        self.__core_channel: grpc.Channel = None
        self.__serving_channel: grpc.Channel = None
        self._core_service_stub: CoreServiceStub = None
        self._serving_service_stub: ServingServiceStub = None

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL
        """

        if self._core_url is not None:
            return self._core_url
        if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_CORE_URL_ENV_KEY)
        return ""

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL

        Args:
            value: Feast Core URL
        """
        self._core_url = value

    @property
    def serving_url(self) -> str:
        """
        Retrieve Feast Serving URL
        """
        if self._serving_url is not None:
            return self._serving_url
        if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_SERVING_URL_ENV_KEY)
        return ""

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL

        Args:
            value: Feast Serving URL
        """
        self._serving_url = value

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving
        """
        result = {}

        if self.serving_url:
            self._connect_serving()
            serving_version = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            result["serving"] = {
                "url": self.serving_url,
                "version": serving_version
            }

        if self.core_url:
            self._connect_core()
            core_version = self._core_service_stub.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            result["core"] = {"url": self.core_url, "version": core_version}

        return result

    def _connect_core(self, skip_if_connected: bool = True):
        """
        Connect to Core API

        Args:
            skip_if_connected: Do not attempt to connect if already connected
        """
        if skip_if_connected and self._core_service_stub:
            return

        if not self.core_url:
            raise ValueError("Please set Feast Core URL.")

        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)

        try:
            grpc.channel_ready_future(self.__core_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Core gRPC server {self.core_url} ")
        else:
            self._core_service_stub = CoreServiceStub(self.__core_channel)

    def _connect_serving(self, skip_if_connected=True):
        """
        Connect to Serving API

        Args:
            skip_if_connected: Do not attempt to connect if already connected
        """

        if skip_if_connected and self._serving_service_stub:
            return

        if not self.serving_url:
            raise ValueError("Please set Feast Serving URL.")

        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)

        try:
            grpc.channel_ready_future(self.__serving_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Serving gRPC server {self.serving_url} ")
        else:
            self._serving_service_stub = ServingServiceStub(
                self.__serving_channel)

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single
        feature set or a list can be provided.

        Args:
            feature_sets: List of feature sets that will be registered
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]
        for feature_set in feature_sets:
            if isinstance(feature_set, FeatureSet):
                self._apply_feature_set(feature_set)
                continue
            raise ValueError(
                f"Could not determine feature set type to apply {feature_set}")

    def _apply_feature_set(self, feature_set: FeatureSet):
        """
        Registers a single feature set with Feast

        Args:
            feature_set: Feature set that will be registered
        """
        self._connect_core()
        feature_set._client = self

        feature_set.is_valid()

        # Convert the feature set to a request and send to Feast Core
        apply_fs_response = self._core_service_stub.ApplyFeatureSet(
            ApplyFeatureSetRequest(feature_set=feature_set.to_proto()),
            timeout=GRPC_CONNECTION_TIMEOUT_APPLY,
        )  # type: ApplyFeatureSetResponse

        # Extract the returned feature set
        applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

        # If the feature set has changed, update the local copy
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
            print(
                f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"'
            )

        # If no change has been applied, do nothing
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
            print(f"No change detected or applied: {feature_set.name}")

        # Deep copy from the returned feature set to the local feature set
        feature_set._update_from_feature_set(applied_fs)

    def list_feature_sets(self) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core

        Returns:
            List of feature sets
        """
        self._connect_core()

        # Get latest feature sets from Feast Core
        feature_set_protos = self._core_service_stub.ListFeatureSets(
            ListFeatureSetsRequest())  # type: ListFeatureSetsResponse

        # Extract feature sets and return
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(self,
                        name: str,
                        version: int = None) -> Union[FeatureSet, None]:
        """
        Retrieves a feature set. If no version is specified then the latest
        version will be returned.

        Args:
            name: Name of feature set
            version: Version of feature set

        Returns:
            Returns either the specified feature set, or raises an exception if
            none is found
        """
        self._connect_core()

        if version is None:
            version = 0
        get_feature_set_response = self._core_service_stub.GetFeatureSet(
            GetFeatureSetRequest(
                name=name.strip(),
                version=int(version)))  # type: GetFeatureSetResponse
        return FeatureSet.from_proto(get_feature_set_response.feature_set)

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets

        Returns:
            Dictionary of entities, indexed by name
        """
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(self, feature_ids: List[str],
                           entity_rows: Union[pd.DataFrame, str]) -> Job:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_ids (List[str]):
                List of feature ids that will be returned for each entity.
                Each feature id should have the following format
                "feature_set_name:version:feature_name".

            entity_rows (Union[pd.DataFrame, str]):
                Pandas dataframe containing entities and a 'datetime' column.
                Each entity in a feature set must be present as a column in this
                dataframe. The datetime column must contain timestamps in
                datetime64 format.

        Returns:
            feast.job.Job:
                Returns a job object that can be used to monitor retrieval
                progress asynchronously, and can be used to materialize the
                results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> import pandas as pd
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_ids = ["customer:1:bookings_7d"]
            >>> entity_rows = pd.DataFrame(
            >>>         {
            >>>            "datetime": [datetime.now() for _ in range(3)],
            >>>            "customer": [1001, 1002, 1003],
            >>>         }
            >>>     )
            >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows)
            >>> df = feature_retrieval_job.to_dataframe()
            >>> print(df)
        """

        self._connect_serving()

        fs_request = _build_feature_set_request(feature_ids)

        # Retrieve serving information to determine store type and
        # staging location
        serving_info = self._serving_service_stub.GetFeastServingInfo(
            GetFeastServingInfoRequest(),
            timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
        )  # type: GetFeastServingInfoResponse

        if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
            raise Exception(
                f'You are connected to a store "{self._serving_url}" which '
                f"does not support batch retrieval ")

        if isinstance(entity_rows, pd.DataFrame):
            # Pandas DataFrame detected
            # Validate entity rows based on entities in Feast Core
            self._validate_dataframe_for_batch_retrieval(
                entity_rows=entity_rows, feature_sets_request=fs_request)

            # Remove timezone from datetime column
            if isinstance(entity_rows["datetime"].dtype,
                          pd.core.dtypes.dtypes.DatetimeTZDtype):
                entity_rows["datetime"] = pd.DatetimeIndex(
                    entity_rows["datetime"]).tz_localize(None)
        elif isinstance(entity_rows, str):
            # String based source
            if entity_rows.endswith((".avro", "*")):
                # Validate Avro entity rows based on entities in Feast Core
                self._validate_avro_for_batch_retrieval(
                    source=entity_rows, feature_sets_request=fs_request)
            else:
                raise Exception(
                    "Only .avro and wildcard paths are accepted as entity_rows"
                )
        else:
            raise Exception(f"Only pandas.DataFrame and str types are allowed"
                            f" as entity_rows, but got {type(entity_rows)}.")

        # Export and upload entity row DataFrame to staging location
        # provided by Feast
        staged_files = export_source_to_staging_location(
            entity_rows, serving_info.job_staging_location)  # type: List[str]

        request = GetBatchFeaturesRequest(
            feature_sets=fs_request,
            dataset_source=DatasetSource(file_source=DatasetSource.FileSource(
                file_uris=staged_files,
                data_format=DataFormat.DATA_FORMAT_AVRO)),
        )

        # Retrieve Feast Job object to manage life cycle of retrieval
        response = self._serving_service_stub.GetBatchFeatures(request)
        return Job(response.job, self._serving_service_stub)

    def _validate_dataframe_for_batch_retrieval(self,
                                                entity_rows: pd.DataFrame,
                                                feature_sets_request):
        """
        Validate whether the entity rows in a DataFrame contain the correct
        information for batch retrieval.

        Datetime column must be present in the DataFrame.

        Args:
            entity_rows (pd.DataFrame):
                Pandas DataFrame containing entities and datetime column. Each
                entity in a feature set must be present as a column in this
                DataFrame.

            feature_sets_request:
                Feature sets that will be requested.
        """

        self._validate_columns(columns=entity_rows.columns,
                               feature_sets_request=feature_sets_request,
                               datetime_field="datetime")

    def _validate_avro_for_batch_retrieval(self, source: str,
                                           feature_sets_request):
        """
        Validate whether the entity rows in an Avro source file contain the
        correct information for batch retrieval.

        Only gs:// and local files (file://) uri schemes are allowed.

        Avro file must have a column named "event_timestamp".

        No checks will be done if a GCS path is provided.

        Args:
            source (str):
                File path to Avro.

            feature_sets_request:
                Feature sets that will be requested.
        """
        p = urlparse(source)

        if p.scheme == "gs":
            # GCS path provided (Risk is delegated to user)
            # No validation if GCS path is provided
            return
        elif p.scheme == "file" or not p.scheme:
            # Local file (file://) provided
            file_path = os.path.abspath(os.path.join(p.netloc, p.path))
        else:
            raise Exception(
                f"Unsupported uri scheme provided {p.scheme}, only "
                f"local files (file://), and gs:// schemes are "
                f"allowed")

        with open(file_path, "rb") as f:
            reader = fastavro.reader(f)
            schema = json.loads(reader.metadata["avro.schema"])
            columns = [x["name"] for x in schema["fields"]]
            self._validate_columns(columns=columns,
                                   feature_sets_request=feature_sets_request,
                                   datetime_field="event_timestamp")

    def _validate_columns(self, columns: List[str], feature_sets_request,
                          datetime_field: str) -> None:
        """
        Check if the required column contains the correct values for batch
        retrieval.

        Args:
            columns (List[str]):
                List of columns to validate against feature_sets_request.

            feature_sets_request ():
                Feature sets that will be requested.

            datetime_field (str):
                Name of the datetime field that must be enforced and present as
                a column in the data source.

        Returns:
            None:
                None
        """
        # Ensure datetime column exists
        if datetime_field not in columns:
            raise ValueError(
                f'Entity rows do not contain "{datetime_field}" column in '
                f'columns {columns}')

        # Validate Avro columns based on feature set entities
        for feature_set in feature_sets_request:
            fs = self.get_feature_set(name=feature_set.name,
                                      version=feature_set.version)
            if fs is None:
                raise ValueError(
                    f'Feature set "{feature_set.name}:{feature_set.version}" '
                    f"could not be found")
            for entity_type in fs.entities:
                if entity_type.name not in columns:
                    raise ValueError(
                        f'Input does not contain entity'
                        f' "{entity_type.name}" column in columns "{columns}"')

    def get_online_features(
        self,
        feature_ids: List[str],
        entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
    ) -> GetOnlineFeaturesResponse:
        """
        Retrieves the latest online feature data from Feast Serving

        Args:
            feature_ids: List of feature Ids in the following format
                [feature_set_name]:[version]:[feature_name]
                example:
                    ["feature_set_1:6:my_feature_1",
                    "feature_set_1:6:my_feature_2",]
            entity_rows: List of GetOnlineFeaturesRequest.EntityRow where each
                row contains entities. Timestamp should not be set for online
                retrieval. All entity types within a feature set must be
                provided.

        Returns:
            Returns a list of maps where each item in the list contains the
            latest feature values for the provided entities
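
        Example (illustrative sketch; feature ids use the
        "feature_set_name:version:feature_name" format of this SDK version,
        and the Value construction assumes its protobuf types):
            >>> from feast.types.Value_pb2 import Value
            >>> entity_rows = [
            >>>     GetOnlineFeaturesRequest.EntityRow(
            >>>         fields={"customer": Value(int64_val=1001)})
            >>> ]
            >>> response = client.get_online_features(
            >>>     feature_ids=["feature_set_1:6:my_feature_1"],
            >>>     entity_rows=entity_rows)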
        """

        self._connect_serving()

        return self._serving_service_stub.GetOnlineFeatures(
            GetOnlineFeaturesRequest(
                feature_sets=_build_feature_set_request(feature_ids),
                entity_rows=entity_rows,
            ))  # type: GetOnlineFeaturesResponse

    def ingest(self,
               feature_set: Union[str, FeatureSet],
               source: Union[pd.DataFrame, str],
               chunk_size: int = 10000,
               version: int = None,
               force_update: bool = False,
               max_workers: int = max(CPU_COUNT - 1, 1),
               disable_progress_bar: bool = False,
               timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT) -> None:
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set (typing.Union[str, FeatureSet]):
                Feature set object or the string name of the feature set
                (without a version).

            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json

            chunk_size (int):
                Amount of rows to load and ingest at a time.

            version (int):
                Feature set version.

            force_update (bool):
                Automatically update feature set based on source data prior to
                ingesting. This will also register changes to Feast.

            max_workers (int):
                Number of worker processes to use to encode values.

            disable_progress_bar (bool):
                Disable printing of progress statistics.

            timeout (int):
                Timeout in seconds to wait for completion.

        Returns:
            None:
                None
        """

        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            raise Exception("Feature set name must be provided")

        # Read table and get row count
        tmp_table_name = _read_table_from_source(source, chunk_size,
                                                 max_workers)

        pq_file = pq.ParquetFile(tmp_table_name)

        row_count = pq_file.metadata.num_rows

        # Update the feature set based on PyArrow table of first row group
        if force_update:
            feature_set.infer_fields_from_pa(table=pq_file.read_row_group(0),
                                             discard_unused_fields=True,
                                             replace_existing_features=True)
            self.apply(feature_set)
        current_time = time.time()

        print("Waiting for feature set to be ready for ingestion...")
        while True:
            if timeout is not None and time.time() - current_time >= timeout:
                raise TimeoutError(
                    "Timed out waiting for feature set to be ready")
            feature_set = self.get_feature_set(name, version)
            if (feature_set is not None
                    and feature_set.status == FeatureSetStatus.STATUS_READY):
                break
            time.sleep(3)

        if timeout is not None:
            timeout = timeout - int(time.time() - current_time)

        try:
            # Kafka configs
            brokers = feature_set.get_kafka_source_brokers()
            topic = feature_set.get_kafka_source_topic()
            producer = get_producer(brokers, row_count, disable_progress_bar)

            # Loop optimization declarations
            produce = producer.produce
            flush = producer.flush

            # Transform and push data to Kafka
            if feature_set.source.source_type == "Kafka":
                for chunk in get_feature_row_chunks(
                        file=tmp_table_name,
                        row_groups=list(range(pq_file.num_row_groups)),
                        fs=feature_set,
                        max_workers=max_workers):

                    # Push FeatureRow one chunk at a time to kafka
                    for serialized_row in chunk:
                        produce(topic=topic, value=serialized_row)

                    # Force a flush after each chunk
                    flush(timeout=timeout)

                    # Remove chunk from memory
                    del chunk

            else:
                raise Exception(
                    f"Could not determine source type for feature set "
                    f'"{feature_set.name}" with source type '
                    f'"{feature_set.source.source_type}"')

            # Print ingestion statistics
            producer.print_results()
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            os.remove(tmp_table_name)

        return None
Example #3
class Client:
    def __init__(self,
                 core_url: str = None,
                 serving_url: str = None,
                 verbose: bool = False):
        self._core_url = core_url
        self._serving_url = serving_url
        self._verbose = verbose
        self.__core_channel: grpc.Channel = None
        self.__serving_channel: grpc.Channel = None
        self._core_service_stub: CoreServiceStub = None
        self._serving_service_stub: ServingServiceStub = None

    @property
    def core_url(self) -> str:
        if self._core_url is not None:
            return self._core_url
        if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_CORE_URL_ENV_KEY)
        return ""

    @core_url.setter
    def core_url(self, value: str):
        self._core_url = value

    @property
    def serving_url(self) -> str:
        if self._serving_url is not None:
            return self._serving_url
        if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_SERVING_URL_ENV_KEY)
        return ""

    @serving_url.setter
    def serving_url(self, value: str):
        self._serving_url = value

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving
        :return: Dictionary containing Core and Serving versions and status
        """

        self._connect_core()
        self._connect_serving()

        core_version = ""
        serving_version = ""
        core_status = "not connected"
        serving_status = "not connected"

        try:
            core_version = self._core_service_stub.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            core_status = "connected"
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetFeastCoreVersion", e.code(),
                                      e.details()))

        try:
            serving_version = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            serving_status = "connected"
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetFeastServingInfo", e.code(),
                                      e.details()))

        return {
            "core": {
                "url": self.core_url,
                "version": core_version,
                "status": core_status,
            },
            "serving": {
                "url": self.serving_url,
                "version": serving_version,
                "status": serving_status,
            },
        }

    def _connect_core(self, skip_if_connected=True):
        """
        Connect to Core API
        """
        if skip_if_connected and self._core_service_stub:
            return

        if not self.core_url:
            raise ValueError("Please set Feast Core URL.")

        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)

        try:
            grpc.channel_ready_future(self.__core_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            print(
                f"Connection timed out while attempting to connect to Feast Core gRPC server {self.core_url}"
            )
            sys.exit(1)
        else:
            self._core_service_stub = CoreServiceStub(self.__core_channel)

    def _connect_serving(self, skip_if_connected=True):
        """
        Connect to Serving API
        """

        if skip_if_connected and self._serving_service_stub:
            return

        if not self.serving_url:
            raise ValueError("Please set Feast Serving URL.")

        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)

        try:
            grpc.channel_ready_future(self.__serving_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            print(
                f"Connection timed out while attempting to connect to Feast Serving gRPC server {self.serving_url} "
            )
            sys.exit(1)
        else:
            self._serving_service_stub = ServingServiceStub(
                self.__serving_channel)

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single feature set or a list can be provided.
        :param feature_sets: Union[List[FeatureSet], FeatureSet]
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]
        for feature_set in feature_sets:
            if isinstance(feature_set, FeatureSet):
                self._apply_feature_set(feature_set)
                continue
            raise ValueError(
                f"Could not determine feature set type to apply {feature_set}")

    def _apply_feature_set(self, feature_set: FeatureSet):
        self._connect_core()
        feature_set._client = self

        valid, message = feature_set.is_valid()
        if not valid:
            raise Exception(message)
        try:
            apply_fs_response = self._core_service_stub.ApplyFeatureSet(
                ApplyFeatureSetRequest(feature_set=feature_set.to_proto()),
                timeout=GRPC_CONNECTION_TIMEOUT_APPLY,
            )  # type: ApplyFeatureSetResponse
            applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

            if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
                print(
                    f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}".'
                )
                feature_set._update_from_feature_set(applied_fs,
                                                     is_dirty=False)
                return
            if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
                print(f"No change detected in feature set {feature_set.name}")
                return
        except grpc.RpcError as e:
            print(
                format_grpc_exception("ApplyFeatureSet", e.code(),
                                      e.details()))

    def list_feature_sets(self) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core
        :return: Returns a list of feature sets
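
        Example (a sketch, not part of the original source):

        >>> client = Client(core_url="localhost:6565")
        >>> for fs in client.list_feature_sets():
        >>>     print(fs.name, fs.version)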
        """
        self._connect_core()

        try:
            # Get latest feature sets from Feast Core
            feature_set_protos = self._core_service_stub.ListFeatureSets(
                ListFeatureSetsRequest())  # type: ListFeatureSetsResponse
        except grpc.RpcError as e:
            raise Exception(
                format_grpc_exception("ListFeatureSets", e.code(),
                                      e.details()))

        # Store list of feature sets
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(
            self,
            name: str,
            version: int = None,
            fail_if_missing: bool = False) -> Union[FeatureSet, None]:
        """
        Retrieve a single feature set from Feast Core
        :param name: (str) Name of feature set
        :param version: (int) Version of feature set
        :param fail_if_missing: (bool) Throws an exception if the feature set
            is not found
        :return: Returns a single feature set
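
        Example (a sketch, not part of the original source):

        >>> fs = client.get_feature_set("customer", version=1)
        >>> fs = client.get_feature_set("customer", fail_if_missing=True)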

        """
        self._connect_core()
        try:
            if version is None:
                # Version 0 requests the latest feature set version
                version = 0
            get_feature_set_response = self._core_service_stub.GetFeatureSet(
                GetFeatureSetRequest(
                    name=name.strip(),
                    version=str(version)))  # type: GetFeatureSetResponse
            feature_set = get_feature_set_response.feature_set
        except grpc.RpcError as e:
            print(format_grpc_exception("GetFeatureSet", e.code(),
                                        e.details()))
        else:
            if feature_set is not None:
                return FeatureSet.from_proto(feature_set)

            if fail_if_missing:
                raise Exception(
                    f'Could not find feature set with name "{name}" and '
                    f'version "{version}"')

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets
        :return: Dictionary of entity name to Entity
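
        Example (a sketch, not part of the original source):

        >>> entities = client.list_entities()
        >>> print(list(entities.keys()))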
        """
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(self, feature_ids: List[str],
                           entity_rows: pd.DataFrame) -> Job:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_ids: List of feature ids that will be returned for each
                entity. Each feature id should have the following format
                "feature_set_name:version:feature_name".
            entity_rows: Pandas dataframe containing entities and a 'datetime'
                column. Each entity in a feature set must be present as a
                column in this dataframe. The datetime column must contain
                timestamps in datetime64 format.

        Returns:
            Feast batch retrieval job: feast.job.Job
            
        Example usage:
        ============================================================
        >>> from feast import Client
        >>> from datetime import datetime
        >>> import pandas as pd
        >>>
        >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
        >>> feature_ids = ["customer:1:bookings_7d"]
        >>> entity_rows = pd.DataFrame(
        >>>         {
        >>>            "datetime": [datetime.now() for _ in range(3)],
        >>>            "customer": [1001, 1002, 1003],
        >>>         }
        >>>     )
        >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows)
        >>> df = feature_retrieval_job.to_dataframe()
        >>> print(df)
        """

        self._connect_serving()

        try:
            fs_request = _build_feature_set_request(feature_ids)

            # Validate entity rows based on entities in Feast Core
            self._validate_entity_rows_for_batch_retrieval(
                entity_rows, fs_request)

            # We want the timestamp column naming to be consistent with the
            # rest of Feast
            entity_rows.columns = [
                "event_timestamp" if col == "datetime" else col
                for col in entity_rows.columns
            ]

            # Remove timezone from datetime column
            if isinstance(
                    entity_rows["event_timestamp"].dtype,
                    pd.core.dtypes.dtypes.DatetimeTZDtype,
            ):
                entity_rows["event_timestamp"] = pd.DatetimeIndex(
                    entity_rows["event_timestamp"]).tz_localize(None)

            # Retrieve serving information to determine store type and staging location
            serving_info = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            )  # type: GetFeastServingInfoResponse

            if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
                raise Exception(
                    f'You are connected to a store "{self.serving_url}" which does not support batch retrieval'
                )

            # Export and upload entity row dataframe to staging location provided by Feast
            staged_file = export_dataframe_to_staging_location(
                entity_rows, serving_info.job_staging_location)  # type: str

            request = GetBatchFeaturesRequest(
                feature_sets=fs_request,
                dataset_source=DatasetSource(
                    file_source=DatasetSource.FileSource(
                        file_uris=[staged_file],
                        data_format=DataFormat.DATA_FORMAT_AVRO)),
            )

            # Retrieve Feast Job object to manage life cycle of retrieval
            response = self._serving_service_stub.GetBatchFeatures(request)
            return Job(response.job, self._serving_service_stub)

        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetBatchFeatures", e.code(),
                                      e.details()))

    def _validate_entity_rows_for_batch_retrieval(self, entity_rows,
                                                  feature_sets_request):
        """
        Validate whether an entity_row dataframe contains the correct information for batch retrieval
        :param entity_rows: Pandas dataframe containing entities and datetime column. Each entity in a feature set
        must be present as a column in this dataframe.
        :param feature_sets_request: Feature sets that will be requested
        """

        # Ensure datetime column exists
        if "datetime" not in entity_rows.columns:
            raise ValueError(
                f'Entity rows dataframe does not contain a "datetime" column in columns {entity_rows.columns}'
            )

        # Validate dataframe columns based on feature set entities
        for feature_set in feature_sets_request:
            fs = self.get_feature_set(name=feature_set.name,
                                      version=feature_set.version)
            if fs is None:
                raise ValueError(
                    f'Feature set "{feature_set.name}:{feature_set.version}" could not be found'
                )
            for entity_type in fs.entities:
                if entity_type.name not in entity_rows.columns:
                    raise ValueError(
                        f'Dataframe does not contain entity "{entity_type.name}" column in columns "{entity_rows.columns}"'
                    )
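
    # For reference, a minimal dataframe that passes this validation for a
    # feature set with a single entity "customer" (a sketch, not from the
    # original source):
    #
    #   entity_rows = pd.DataFrame({
    #       "datetime": [datetime.now()],
    #       "customer": [1001],
    #   })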

    def get_online_features(
        self,
        feature_ids: List[str],
        entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
    ) -> GetOnlineFeaturesResponse:
        """
        Retrieves the latest online feature data from Feast Serving
        :param feature_ids: List of feature Ids in the following format
                            [feature_set_name]:[version]:[feature_name]
                            example: ["feature_set_1:6:my_feature_1",
                                     "feature_set_1:6:my_feature_2",]

        :param entity_rows: List of GetFeaturesRequest.EntityRow where each row
                            contains entities. Timestamp should not be set for
                            online retrieval. All entity types within a feature
                            set must be provided for each entity key.
        :return: Returns a list of maps where each item in the list contains
                 the latest feature values for the provided entities
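
        Example (a sketch; assumes the EntityRow message exposes a "fields"
        map of entity name to feast Value, as in the serving API protos):

        >>> from feast.types.Value_pb2 import Value
        >>> row = GetOnlineFeaturesRequest.EntityRow(
        >>>     fields={"customer": Value(int64_val=1001)})
        >>> client.get_online_features(["customer:1:bookings_7d"], [row])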
        """
        self._connect_serving()

        try:
            response = self._serving_service_stub.GetOnlineFeatures(
                GetOnlineFeaturesRequest(
                    feature_sets=_build_feature_set_request(feature_ids),
                    entity_rows=entity_rows,
                ))  # type: GetOnlineFeaturesResponse
        except grpc.RpcError as e:
            print(
                format_grpc_exception("GetOnlineFeatures", e.code(),
                                      e.details()))
        else:
            return response

    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        dataframe: pd.DataFrame,
        version: int = None,
        force_update: bool = False,
        max_workers: int = CPU_COUNT,
        disable_progress_bar: bool = False,
        chunk_size: int = 5000,
    ):
        """
        Loads data into Feast for a specific feature set.

        :param feature_set: (str, FeatureSet) Feature set object or the
            string name of the feature set (without a version)
        :param dataframe: Pandas dataframe to load into Feast for this
            feature set
        :param version: (int) Version of the feature set for which this
            ingestion should happen
        :param force_update: (bool) Automatically update feature set based
            on data frame before ingesting data
        :param max_workers: Number of worker processes to use to encode the
            dataframe
        :param disable_progress_bar: Disable progress bar during ingestion
        :param chunk_size: Number of rows per chunk to encode before
            ingesting to Feast
        """
        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            raise Exception(f"Feature set name must be provided")

        feature_set = self.get_feature_set(name, version, fail_if_missing=True)

        # Update the feature set based on dataframe schema
        if force_update:
            feature_set.infer_fields_from_df(dataframe,
                                             discard_unused_fields=True,
                                             replace_existing_features=True)
            self.apply(feature_set)

        if feature_set.source.source_type == "Kafka":
            ingest_kafka(
                feature_set=feature_set,
                dataframe=dataframe,
                max_workers=max_workers,
                disable_progress_bar=disable_progress_bar,
                chunk_size=chunk_size,
            )
        else:
            raise Exception(f"Could not determine source type for feature set "
                            f'"{feature_set.name}" with source type '
                            f'"{feature_set.source.source_type}"')
Example #4
0
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.
    """
    def __init__(self,
                 core_url: str = None,
                 serving_url: str = None,
                 verbose: bool = False):
        """
        The Feast Client should be initialized with at least one service url

        Args:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            verbose: Enable verbose logging
        """
        self._core_url = core_url
        self._serving_url = serving_url
        self._verbose = verbose
        self.__core_channel: grpc.Channel = None
        self.__serving_channel: grpc.Channel = None
        self._core_service_stub: CoreServiceStub = None
        self._serving_service_stub: ServingServiceStub = None

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL
        """

        if self._core_url is not None:
            return self._core_url
        if os.getenv(FEAST_CORE_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_CORE_URL_ENV_KEY)
        return ""

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL

        Args:
            value: Feast Core URL
        """
        self._core_url = value

    @property
    def serving_url(self) -> str:
        """
        Retrieve Feast Serving URL
        """
        if self._serving_url is not None:
            return self._serving_url
        if os.getenv(FEAST_SERVING_URL_ENV_KEY) is not None:
            return os.getenv(FEAST_SERVING_URL_ENV_KEY)
        return ""

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL

        Args:
            value: Feast Serving URL
        """
        self._serving_url = value

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving
        """
        result = {}

        if self.serving_url:
            self._connect_serving()
            serving_version = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            result["serving"] = {
                "url": self.serving_url,
                "version": serving_version
            }

        if self.core_url:
            self._connect_core()
            core_version = self._core_service_stub.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(),
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT).version
            result["core"] = {"url": self.core_url, "version": core_version}

        return result
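
    # Sketch of the structure version() returns when both URLs are set
    # (values are illustrative, not from the source):
    #
    #   {"serving": {"url": "localhost:6566", "version": "0.3.0"},
    #    "core": {"url": "localhost:6565", "version": "0.3.0"}}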

    def _connect_core(self, skip_if_connected: bool = True):
        """
        Connect to Core API

        Args:
            skip_if_connected: Do not attempt to connect if already connected
        """
        if skip_if_connected and self._core_service_stub:
            return

        if not self.core_url:
            raise ValueError("Please set Feast Core URL.")

        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)

        try:
            grpc.channel_ready_future(self.__core_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Core gRPC server {self.core_url} ")
        else:
            self._core_service_stub = CoreServiceStub(self.__core_channel)

    def _connect_serving(self, skip_if_connected=True):
        """
        Connect to Serving API

        Args:
            skip_if_connected: Do not attempt to connect if already connected
        """

        if skip_if_connected and self._serving_service_stub:
            return

        if not self.serving_url:
            raise ValueError("Please set Feast Serving URL.")

        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)

        try:
            grpc.channel_ready_future(self.__serving_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT)
        except grpc.FutureTimeoutError:
            raise ConnectionError(
                f"Connection timed out while attempting to connect to Feast "
                f"Serving gRPC server {self.serving_url} ")
        else:
            self._serving_service_stub = ServingServiceStub(
                self.__serving_channel)

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single
        feature set or a list can be provided.

        Args:
            feature_sets: List of feature sets that will be registered
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]
        for feature_set in feature_sets:
            if isinstance(feature_set, FeatureSet):
                self._apply_feature_set(feature_set)
                continue
            raise ValueError(
                f"Could not determine feature set type to apply {feature_set}")

    def _apply_feature_set(self, feature_set: FeatureSet):
        """
        Registers a single feature set with Feast

        Args:
            feature_set: Feature set that will be registered
        """
        self._connect_core()
        feature_set._client = self

        feature_set.is_valid()

        # Convert the feature set to a request and send to Feast Core
        apply_fs_response = self._core_service_stub.ApplyFeatureSet(
            ApplyFeatureSetRequest(feature_set=feature_set.to_proto()),
            timeout=GRPC_CONNECTION_TIMEOUT_APPLY,
        )  # type: ApplyFeatureSetResponse

        # Extract the returned feature set
        applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

        # If the feature set has changed, update the local copy
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
            print(
                f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}"'
            )

        # If no change has been applied, do nothing
        if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
            print(f"No change detected or applied: {feature_set.name}")

        # Deep copy from the returned feature set to the local feature set
        feature_set.update_from_feature_set(applied_fs)

    def list_feature_sets(self) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core

        Returns:
            List of feature sets
        """
        self._connect_core()

        # Get latest feature sets from Feast Core
        feature_set_protos = self._core_service_stub.ListFeatureSets(
            ListFeatureSetsRequest())  # type: ListFeatureSetsResponse

        # Extract feature sets and return
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(self,
                        name: str,
                        version: int = None) -> Union[FeatureSet, None]:
        """
        Retrieves a feature set. If no version is specified then the latest
        version will be returned.

        Args:
            name: Name of feature set
            version: Version of feature set

        Returns:
            Returns either the specified feature set, or raises an exception if
            none is found
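
        Example (a sketch, not part of the original source):

        >>> fs = client.get_feature_set("customer")  # latest version
        >>> fs = client.get_feature_set("customer", version=2)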
        """
        self._connect_core()

        if version is None:
            version = 0
        get_feature_set_response = self._core_service_stub.GetFeatureSet(
            GetFeatureSetRequest(
                name=name.strip(),
                version=int(version)))  # type: GetFeatureSetResponse
        return FeatureSet.from_proto(get_feature_set_response.feature_set)

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets

        Returns:
            Dictionary of entities, indexed by name
        """
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(self, feature_ids: List[str],
                           entity_rows: pd.DataFrame) -> Job:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_ids: List of feature ids that will be returned for each
                entity. Each feature id should have the following format
                "feature_set_name:version:feature_name".
            entity_rows: Pandas dataframe containing entities and a 'datetime'
                column. Each entity in a feature set must be present as a
                column in this dataframe. The datetime column must contain
                timestamps in datetime64 format.

        Returns:
            Returns a job object that can be used to monitor retrieval progress
            asynchronously, and can be used to materialize the results

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> import pandas as pd
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_ids = ["customer:1:bookings_7d"]
            >>> entity_rows = pd.DataFrame(
            >>>         {
            >>>            "datetime": [datetime.now() for _ in range(3)],
            >>>            "customer": [1001, 1002, 1003],
            >>>         }
            >>>     )
            >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows)
            >>> df = feature_retrieval_job.to_dataframe()
            >>> print(df)
        """

        self._connect_serving()

        fs_request = _build_feature_set_request(feature_ids)

        # Validate entity rows based on entities in Feast Core
        self._validate_entity_rows_for_batch_retrieval(entity_rows, fs_request)

        # Remove timezone from datetime column
        if isinstance(entity_rows["datetime"].dtype,
                      pd.core.dtypes.dtypes.DatetimeTZDtype):
            entity_rows["datetime"] = pd.DatetimeIndex(
                entity_rows["datetime"]).tz_localize(None)

        # Retrieve serving information to determine store type and
        # staging location
        serving_info = self._serving_service_stub.GetFeastServingInfo(
            GetFeastServingInfoRequest(),
            timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
        )  # type: GetFeastServingInfoResponse

        if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
            raise Exception(
                f'You are connected to a store "{self._serving_url}" which '
                f"does not support batch retrieval ")

        # Export and upload entity row dataframe to staging location
        # provided by Feast
        staged_file = export_dataframe_to_staging_location(
            entity_rows, serving_info.job_staging_location)  # type: str

        request = GetBatchFeaturesRequest(
            feature_sets=fs_request,
            dataset_source=DatasetSource(file_source=DatasetSource.FileSource(
                file_uris=[staged_file],
                data_format=DataFormat.DATA_FORMAT_AVRO)),
        )

        # Retrieve Feast Job object to manage life cycle of retrieval
        response = self._serving_service_stub.GetBatchFeatures(request)
        return Job(response.job, self._serving_service_stub)

    def _validate_entity_rows_for_batch_retrieval(self, entity_rows,
                                                  feature_sets_request):
        """
        Validate whether an entity_row dataframe contains the correct
        information for batch retrieval

        Args:
            entity_rows: Pandas dataframe containing entities and datetime
                column. Each entity in a feature set must be present as a
                column in this dataframe.
            feature_sets_request: Feature sets that will be requested
        """

        # Ensure datetime column exists
        if "datetime" not in entity_rows.columns:
            raise ValueError(
                f'Entity rows dataframe does not contain a "datetime" column '
                f"{entity_rows.columns}")

        # Validate dataframe columns based on feature set entities
        for feature_set in feature_sets_request:
            fs = self.get_feature_set(name=feature_set.name,
                                      version=feature_set.version)
            if fs is None:
                raise ValueError(
                    f'Feature set "{feature_set.name}:{feature_set.version}" '
                    f"could not be found")
            for entity_type in fs.entities:
                if entity_type.name not in entity_rows.columns:
                    raise ValueError(
                        f'Dataframe does not contain entity "{entity_type.name}"'
                        f' column in columns "{entity_rows.columns}"')

    def get_online_features(
        self,
        feature_ids: List[str],
        entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
    ) -> GetOnlineFeaturesResponse:
        """
        Retrieves the latest online feature data from Feast Serving

        Args:
            feature_ids: List of feature Ids in the following format
                [feature_set_name]:[version]:[feature_name]
                example:
                    ["feature_set_1:6:my_feature_1",
                    "feature_set_1:6:my_feature_2",]
            entity_rows: List of GetFeaturesRequest.EntityRow where each row
                contains entities. Timestamp should not be set for online
                retrieval. All entity types within a feature set must be
                provided for each entity key.

        Returns:
            Returns a list of maps where each item in the list contains the
            latest feature values for the provided entities
        """

        self._connect_serving()

        return self._serving_service_stub.GetOnlineFeatures(
            GetOnlineFeaturesRequest(
                feature_sets=_build_feature_set_request(feature_ids),
                entity_rows=entity_rows,
            ))  # type: GetOnlineFeaturesResponse

    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        source: Union[pd.DataFrame, str],
        version: int = None,
        force_update: bool = False,
        max_workers: int = CPU_COUNT,
        disable_progress_bar: bool = False,
        chunk_size: int = 5000,
        timeout: int = None,
    ):
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set: Name of feature set or a feature set object
            source: Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                * parquet
                * csv
                * json
            version: Feature set version
            force_update: Automatically update feature set based on source data
                prior to ingesting. This will also register changes to Feast
            max_workers: Number of worker processes to use to encode values
            disable_progress_bar: Disable printing of progress statistics
            chunk_size: Maximum amount of rows to load into memory and ingest at
                a time
            timeout: Seconds to wait before ingestion times out
        """
        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            raise Exception(f"Feature set name must be provided")

        table = _read_table_from_source(source)

        # Resolve a feature set name to the registered FeatureSet object so
        # that schema inference below always operates on an object
        if isinstance(feature_set, str):
            feature_set = self.get_feature_set(name, version)

        # Update the feature set based on DataFrame schema
        if force_update:
            # Use a small chunk as a reference DataFrame to infer fields
            ref_df = table.to_batches(max_chunksize=20)[0].to_pandas()

            feature_set.infer_fields_from_df(ref_df,
                                             discard_unused_fields=True,
                                             replace_existing_features=True)
            self.apply(feature_set)

        feature_set = self.get_feature_set(name, version)

        if feature_set.source.source_type == "Kafka":
            ingest_table_to_kafka(
                feature_set=feature_set,
                table=table,
                max_workers=max_workers,
                disable_pbar=disable_progress_bar,
                chunk_size=chunk_size,
                timeout=timeout,
            )
        else:
            raise Exception(f"Could not determine source type for feature set "
                            f'"{feature_set.name}" with source type '
                            f'"{feature_set.source.source_type}"')