def get_historical_features(
    self,
    feature_refs: List[str],
    entity_source: Union[FileSource, BigQuerySource],
    project: str = None,
) -> RetrievalJob:
    """
    Launch a historical feature retrieval job.

    Args:
        feature_refs: List of feature references that will be returned for each entity.
            Each feature reference should have the following format:
            "feature_table:feature" where "feature_table" & "feature" refer to
            the feature table and feature names respectively.
        entity_source (Union[FileSource, BigQuerySource]): Source for the entity rows.
            The user needs to make sure that the source is accessible from
            the Spark cluster that will be used for the retrieval job.
        project: Specifies the project that contains the feature tables
            which the requested features belong to.

    Returns:
        Returns a retrieval job object that can be used to monitor retrieval
        progress asynchronously, and can be used to materialize the results.

    Examples:
        >>> from feast import Client
        >>> from datetime import datetime
        >>> feast_client = Client(core_url="localhost:6565")
        >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
        >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
        >>> feature_retrieval_job = feast_client.get_historical_features(
        >>>     feature_refs, entity_source, project="my_project")
        >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
    """
    feature_tables = self._get_feature_tables_from_feature_refs(
        feature_refs, project)
    output_location = self._config.get(
        CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION)
    output_format = self._config.get(
        CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)
    job_id = f"historical-feature-{str(uuid.uuid4())}"

    return start_historical_feature_retrieval_job(
        self, entity_source, feature_tables, output_format, output_location,
        job_id)
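# Illustrative sketch (not part of the original source): monitoring the RetrievalJob
# returned by get_historical_features() above. Only methods that appear elsewhere in
# this section are used (get_id, get_output_file_uri); "feast_client", the feature
# references, and the bucket paths are placeholders taken from the docstring example.
feature_retrieval_job = feast_client.get_historical_features(
    ["bookings:bookings_7d", "bookings:booking_14d"],
    FileSource("event_timestamp", "parquet", "gs://some-bucket/customer"),
    project="my_project",
)
print(feature_retrieval_job.get_id())  # e.g. "historical-feature-<uuid>"
# block=False, as used by the job service handler below, returns the output URI
# without waiting for the Spark job to finish.
print(feature_retrieval_job.get_output_file_uri(block=False))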
def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest, context):
    """Produce a training dataset and return a job id that will provide a file reference"""
    job = start_historical_feature_retrieval_job(
        client=self.client,
        project=request.project,
        entity_source=DataSource.from_proto(request.entity_source),
        feature_tables=self.client._get_feature_tables_from_feature_refs(
            list(request.feature_refs), request.project),
        output_format=request.output_format,
        output_path=request.output_location,
    )

    output_file_uri = job.get_output_file_uri(block=False)

    return GetHistoricalFeaturesResponse(
        id=job.get_id(), output_file_uri=output_file_uri)
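# Illustrative sketch (not part of the original source): the request/response exchange
# handled by GetHistoricalFeatures above. Field names match the _use_job_service branch
# of get_historical_features below; the project name, output settings, and the
# "job_service_stub" gRPC stub object are placeholders/assumptions.
request = GetHistoricalFeaturesRequest(
    feature_refs=["bookings:bookings_7d", "bookings:booking_14d"],
    entity_source=entity_source.to_proto(),  # a FileSource/BigQuerySource serialized to proto
    project="my_project",
    output_format="parquet",
    output_location="gs://some-bucket/output/",
)
response = job_service_stub.GetHistoricalFeatures(request)  # stub name is assumed
print(response.id, response.output_file_uri)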
def get_historical_features(
    self,
    feature_refs: List[str],
    entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
    output_location: Optional[str] = None,
) -> RetrievalJob:
    """
    Launch a historical feature retrieval job.

    Args:
        feature_refs: List of feature references that will be returned for each entity.
            Each feature reference should have the following format:
            "feature_table:feature" where "feature_table" & "feature" refer to
            the feature table and feature names respectively.
        entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
            If entity_source is a Pandas DataFrame, the dataframe will be staged
            to become accessible by Spark workers.
            If one of the feature tables' sources is a BigQuery table, the entities
            will be uploaded to BigQuery; otherwise they are staged to remote file
            storage (derived from the configured staging location).
            It is also assumed that the column event_timestamp is present in the
            dataframe, and is of type datetime without timezone information.

            The user needs to make sure that the source (or staging location, if
            entity_source is a Pandas DataFrame) is accessible from the Spark
            cluster that will be used for the retrieval job.
        output_location: Specifies the path in a bucket to write the exported feature data files.

    Returns:
        Returns a retrieval job object that can be used to monitor retrieval
        progress asynchronously, and can be used to materialize the results.

    Examples:
        >>> from feast import Client
        >>> from feast.data_format import ParquetFormat
        >>> from datetime import datetime
        >>> feast_client = Client(core_url="localhost:6565")
        >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
        >>> entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")
        >>> feature_retrieval_job = feast_client.get_historical_features(
        >>>     feature_refs, entity_source)
        >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
    """
    feature_tables = self._get_feature_tables_from_feature_refs(
        feature_refs, self.project)

    assert all(
        ft.batch_source.created_timestamp_column for ft in feature_tables), (
        "All BatchSources attached to retrieved FeatureTables "
        "must have specified `created_timestamp_column` to be used in "
        "historical dataset generation.")

    if output_location is None:
        output_location = os.path.join(
            self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_LOCATION),
            str(uuid.uuid4()),
        )
    output_format = self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_FORMAT)
    feature_sources = [
        feature_table.batch_source for feature_table in feature_tables
    ]

    if isinstance(entity_source, pd.DataFrame):
        if any(
                isinstance(source, BigQuerySource)
                for source in feature_sources):
            first_bq_source = [
                source for source in feature_sources
                if isinstance(source, BigQuerySource)
            ][0]
            source_ref = table_reference_from_string(
                first_bq_source.bigquery_options.table_ref)
            entity_source = stage_entities_to_bq(
                entity_source, source_ref.project, source_ref.dataset_id)
        else:
            entity_source = stage_entities_to_fs(
                entity_source,
                staging_location=self._config.get(opt.SPARK_STAGING_LOCATION),
                config=self._config,
            )

    if self._use_job_service:
        response = self._job_service.GetHistoricalFeatures(
            GetHistoricalFeaturesRequest(
                feature_refs=feature_refs,
                entity_source=entity_source.to_proto(),
                project=self.project,
                output_format=output_format,
                output_location=output_location,
            ),
            **self._extra_grpc_params(),
        )
        return RemoteRetrievalJob(
            self._job_service,
            self._extra_grpc_params,
            response.id,
            output_file_uri=response.output_file_uri,
        )
    else:
        return start_historical_feature_retrieval_job(
            client=self,
            project=self.project,
            entity_source=entity_source,
            feature_tables=feature_tables,
            output_format=output_format,
            output_path=output_location,
        )
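# Illustrative sketch (not part of the original source): calling the method above with a
# Pandas DataFrame as entity_source, so that the client stages the entities automatically.
# The "customer_id" column, the feature references, and "feast_client" are placeholders.
import pandas as pd
from datetime import datetime

entities = pd.DataFrame(
    {
        "customer_id": [1001, 1002],
        # event_timestamp must be timezone-naive datetimes, as required by the docstring above
        "event_timestamp": [datetime(2021, 1, 1), datetime(2021, 1, 2)],
    }
)
job = feast_client.get_historical_features(
    feature_refs=["bookings:bookings_7d", "bookings:booking_14d"],
    entity_source=entities,
)
print(job.get_output_file_uri())  # URI of the exported feature data files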
def get_historical_features(
    self,
    feature_refs: List[str],
    entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
    project: str = None,
) -> RetrievalJob:
    """
    Launch a historical feature retrieval job.

    Args:
        feature_refs: List of feature references that will be returned for each entity.
            Each feature reference should have the following format:
            "feature_table:feature" where "feature_table" & "feature" refer to
            the feature table and feature names respectively.
        entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
            If entity_source is a Pandas DataFrame, the dataframe will be exported
            to the staging location as a parquet file. It is also assumed that the
            column event_timestamp is present in the dataframe, and is of type
            datetime without timezone information.

            The user needs to make sure that the source (or staging location, if
            entity_source is a Pandas DataFrame) is accessible from the Spark
            cluster that will be used for the retrieval job.
        project: Specifies the project that contains the feature tables
            which the requested features belong to.

    Returns:
        Returns a retrieval job object that can be used to monitor retrieval
        progress asynchronously, and can be used to materialize the results.

    Examples:
        >>> from feast import Client
        >>> from datetime import datetime
        >>> feast_client = Client(core_url="localhost:6565")
        >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
        >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
        >>> feature_retrieval_job = feast_client.get_historical_features(
        >>>     feature_refs, entity_source, project="my_project")
        >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
    """
    feature_tables = self._get_feature_tables_from_feature_refs(
        feature_refs, project)
    output_location = os.path.join(
        self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
        str(uuid.uuid4()),
    )
    output_format = self._config.get(
        CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

    if isinstance(entity_source, pd.DataFrame):
        staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
        entity_staging_uri = urlparse(
            os.path.join(staging_location, str(uuid.uuid4())))
        staging_client = get_staging_client(entity_staging_uri.scheme)
        with tempfile.NamedTemporaryFile() as df_export_path:
            # Export the entity dataframe to a temporary parquet file and upload
            # it to the staging location, then reference it as a FileSource.
            entity_source.to_parquet(df_export_path.name)
            bucket = (None if entity_staging_uri.scheme == "file" else
                      entity_staging_uri.netloc)
            staging_client.upload_file(df_export_path.name, bucket,
                                       entity_staging_uri.path.lstrip("/"))
            entity_source = FileSource(
                "event_timestamp",
                "created_timestamp",
                ParquetFormat(),
                entity_staging_uri.geturl(),
            )

    # output_location already includes a unique suffix, so it is passed through as-is.
    return start_historical_feature_retrieval_job(
        self,
        entity_source,
        feature_tables,
        output_format,
        output_location,
    )
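# Illustrative sketch (not part of the original source): how the staging branch above derives
# the upload target from the configured staging location. Uses only the standard library;
# the staging location value is a placeholder.
import os
import uuid
from urllib.parse import urlparse

staging_location = "gs://some-bucket/staging"
entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
bucket = None if entity_staging_uri.scheme == "file" else entity_staging_uri.netloc
blob_path = entity_staging_uri.path.lstrip("/")
print(entity_staging_uri.scheme, bucket, blob_path)
# -> "gs", "some-bucket", "staging/<uuid>": the DataFrame is written to a temporary
#    parquet file, uploaded to this bucket/path, and then wrapped in a FileSource.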