Example #1
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[FileSource, BigQuerySource],
        project: str = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[FileSource, BigQuerySource]): Source for the entity rows.
                The user needs to make sure that the source is accessible from the Spark cluster
                that will be used for the retrieval job.
            project: Specifies the project that contains the feature tables
                which the requested features belong to.

        Returns:
                A retrieval job object that can be used to monitor retrieval
                progress asynchronously and to materialize the results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source, project="my_project")
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, project)
        output_location = self._config.get(
            CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION)
        output_format = self._config.get(
            CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)
        job_id = f"historical-feature-{str(uuid.uuid4())}"

        return start_historical_feature_retrieval_job(self, entity_source,
                                                      feature_tables,
                                                      output_format,
                                                      output_location, job_id)
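The Returns section above says the RetrievalJob can be used to monitor progress and materialize results, but the example stops once the job is launched. Below is a minimal follow-up sketch; it only uses get_id() and get_output_file_uri(), the two RetrievalJob methods that appear elsewhere on this page, and it assumes get_output_file_uri() blocks until the output is written (inferred from Example #2 passing block=False).

    from feast import Client, FileSource  # FileSource import path assumed; the docstring uses it without an import

    feast_client = Client(core_url="localhost:6565")
    feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
    # Entity rows staged as Parquet, exactly as in the docstring example above.
    entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")

    job = feast_client.get_historical_features(feature_refs, entity_source, project="my_project")
    print("job id:", job.get_id())

    # Assumed to block until the Spark job has written its output files,
    # then return the URI of the materialized dataset.
    output_file_uri = job.get_output_file_uri()
    print("results written to:", output_file_uri)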
Example #2
    def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest,
                              context):
        """Produce a training dataset, return a job id that will provide a file reference"""
        job = start_historical_feature_retrieval_job(
            client=self.client,
            project=request.project,
            entity_source=DataSource.from_proto(request.entity_source),
            feature_tables=self.client._get_feature_tables_from_feature_refs(
                list(request.feature_refs), request.project),
            output_format=request.output_format,
            output_path=request.output_location,
        )

        output_file_uri = job.get_output_file_uri(block=False)

        return GetHistoricalFeaturesResponse(id=job.get_id(),
                                             output_file_uri=output_file_uri)
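This servicer method is the server half of the job-service path that Example #3 takes when self._use_job_service is set. For context, a rough client-side sketch follows; the *_pb2 module paths, the JobServiceStub name, and the job-service address are assumptions based on standard gRPC code-generation conventions, while the request and response fields are taken from the snippets on this page.

    import grpc

    from feast import FileSource  # import path assumed
    from feast.data_format import ParquetFormat
    # Generated-module paths below are assumptions (standard *_pb2 naming).
    from feast.core.JobService_pb2 import GetHistoricalFeaturesRequest
    from feast.core.JobService_pb2_grpc import JobServiceStub

    # Entity rows staged as Parquet, as in the docstring examples on this page.
    entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")

    channel = grpc.insecure_channel("localhost:6568")  # hypothetical job-service address
    stub = JobServiceStub(channel)

    response = stub.GetHistoricalFeatures(
        GetHistoricalFeaturesRequest(
            feature_refs=["bookings:bookings_7d", "bookings:booking_14d"],
            entity_source=entity_source.to_proto(),
            project="my_project",
            output_format="parquet",
            output_location="gs://some-bucket/output",
        )
    )
    print(response.id, response.output_file_uri)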
Example #3
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        output_location: Optional[str] = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a pandas DataFrame, the dataframe will be staged
                so that it is accessible to the Spark workers.
                If any of the feature tables' batch sources is a BigQuery source, the entities
                will be uploaded to BigQuery; otherwise they are staged to remote file storage
                (derived from the configured staging location).
                It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            output_location: Specifies the path in a bucket to which the exported feature data files
                will be written. If not provided, a unique path under the configured historical
                feature output location is used.

        Returns:
                A retrieval job object that can be used to monitor retrieval
                progress asynchronously and to materialize the results.

        Examples:
            >>> from feast import Client
            >>> from feast.data_format import ParquetFormat
            >>> from datetime import datetime
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source)
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, self.project)

        assert all(
            ft.batch_source.created_timestamp_column
            for ft in feature_tables), (
                "All BatchSources attached to retrieved FeatureTables "
                "must have specified `created_timestamp_column` to be used in "
                "historical dataset generation.")

        if output_location is None:
            output_location = os.path.join(
                self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_LOCATION),
                str(uuid.uuid4()),
            )
        output_format = self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_FORMAT)
        feature_sources = [
            feature_table.batch_source for feature_table in feature_tables
        ]

        if isinstance(entity_source, pd.DataFrame):
            if any(
                    isinstance(source, BigQuerySource)
                    for source in feature_sources):
                first_bq_source = [
                    source for source in feature_sources
                    if isinstance(source, BigQuerySource)
                ][0]
                source_ref = table_reference_from_string(
                    first_bq_source.bigquery_options.table_ref)
                entity_source = stage_entities_to_bq(entity_source,
                                                     source_ref.project,
                                                     source_ref.dataset_id)
            else:
                entity_source = stage_entities_to_fs(
                    entity_source,
                    staging_location=self._config.get(
                        opt.SPARK_STAGING_LOCATION),
                    config=self._config,
                )

        if self._use_job_service:
            response = self._job_service.GetHistoricalFeatures(
                GetHistoricalFeaturesRequest(
                    feature_refs=feature_refs,
                    entity_source=entity_source.to_proto(),
                    project=self.project,
                    output_format=output_format,
                    output_location=output_location,
                ),
                **self._extra_grpc_params(),
            )
            return RemoteRetrievalJob(
                self._job_service,
                self._extra_grpc_params,
                response.id,
                output_file_uri=response.output_file_uri,
            )
        else:
            return start_historical_feature_retrieval_job(
                client=self,
                project=self.project,
                entity_source=entity_source,
                feature_tables=feature_tables,
                output_format=output_format,
                output_path=output_location,
            )
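The docstring above describes passing a pandas DataFrame as entity_source and having it staged automatically, but its example only shows a FileSource. A hedged sketch of the DataFrame path follows; the customer_id column and the output_location value are hypothetical, and the entity column must match whatever entity the "bookings" feature table is keyed on.

    import pandas as pd
    from feast import Client

    feast_client = Client(core_url="localhost:6565")

    entity_df = pd.DataFrame(
        {
            # Required by the docstring: timezone-naive datetimes.
            "event_timestamp": pd.to_datetime(["2021-06-01 10:00:00", "2021-06-02 10:00:00"]),
            "customer_id": [1001, 1002],  # hypothetical entity column
        }
    )

    # The DataFrame is staged automatically (to BigQuery if any feature table
    # uses a BigQuery source, otherwise to the configured staging location)
    # before the Spark retrieval job starts.
    job = feast_client.get_historical_features(
        feature_refs=["bookings:bookings_7d", "bookings:booking_14d"],
        entity_source=entity_df,
        output_location="gs://some-bucket/output/bookings-training",  # optional; a generated path is used if omitted
    )
    output_file_uri = job.get_output_file_uri()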
Example #4
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        project: str = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a pandas DataFrame, the dataframe will be exported to the staging
                location as a Parquet file. It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            project: Specifies the project that contains the feature tables
                which the requested features belong to.

        Returns:
                A retrieval job object that can be used to monitor retrieval
                progress asynchronously and to materialize the results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source, project="my_project")
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, project)
        output_location = os.path.join(
            self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
            str(uuid.uuid4()),
        )
        output_format = self._config.get(
            CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

        if isinstance(entity_source, pd.DataFrame):
            staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
            entity_staging_uri = urlparse(
                os.path.join(staging_location, str(uuid.uuid4())))
            staging_client = get_staging_client(entity_staging_uri.scheme)
            with tempfile.NamedTemporaryFile() as df_export_path:
                entity_source.to_parquet(df_export_path.name)
                bucket = (None if entity_staging_uri.scheme == "file" else
                          entity_staging_uri.netloc)
                staging_client.upload_file(df_export_path.name, bucket,
                                           entity_staging_uri.path.lstrip("/"))
                entity_source = FileSource(
                    "event_timestamp",
                    "created_timestamp",
                    ParquetFormat(),
                    entity_staging_uri.geturl(),
                )

        return start_historical_feature_retrieval_job(
            self,
            entity_source,
            feature_tables,
            output_format,
            os.path.join(output_location, str(uuid.uuid4())),
        )