def from_proto(cls, feature_table_proto: FeatureTableProto):
    """
    Creates a feature table from a protobuf representation of a feature table.

    Args:
        feature_table_proto: A protobuf representation of a feature table.

    Returns:
        A FeatureTable object based on the feature table protobuf.
    """
    feature_table = cls(
        name=feature_table_proto.spec.name,
        entities=[entity for entity in feature_table_proto.spec.entities],
        features=[
            Feature.from_proto(feature)
            for feature in feature_table_proto.spec.features
        ],
        labels=feature_table_proto.spec.labels,
        max_age=(
            None
            if feature_table_proto.spec.max_age.seconds == 0
            and feature_table_proto.spec.max_age.nanos == 0
            else feature_table_proto.spec.max_age
        ),
        batch_source=DataSource.from_proto(feature_table_proto.spec.batch_source),
        stream_source=(
            None
            if not feature_table_proto.spec.stream_source.ByteSize()
            else DataSource.from_proto(feature_table_proto.spec.stream_source)
        ),
    )

    feature_table._created_timestamp = feature_table_proto.meta.created_timestamp

    return feature_table
def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Creates a feature view from a protobuf representation of a feature view.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A FeatureView object based on the feature view protobuf.
    """
    batch_source = DataSource.from_proto(feature_view_proto.spec.batch_source)
    stream_source = (
        DataSource.from_proto(feature_view_proto.spec.stream_source)
        if feature_view_proto.spec.HasField("stream_source")
        else None
    )
    feature_view = cls(
        name=feature_view_proto.spec.name,
        entities=[entity for entity in feature_view_proto.spec.entities],
        features=[
            Feature(
                name=feature.name,
                dtype=ValueType(feature.value_type),
                labels=dict(feature.labels),
            )
            for feature in feature_view_proto.spec.features
        ],
        tags=dict(feature_view_proto.spec.tags),
        online=feature_view_proto.spec.online,
        ttl=(
            None
            if feature_view_proto.spec.ttl.seconds == 0
            and feature_view_proto.spec.ttl.nanos == 0
            else feature_view_proto.spec.ttl
        ),
        batch_source=batch_source,
        stream_source=stream_source,
    )

    if feature_view_proto.meta.HasField("created_timestamp"):
        feature_view.created_timestamp = (
            feature_view_proto.meta.created_timestamp.ToDatetime()
        )
    if feature_view_proto.meta.HasField("last_updated_timestamp"):
        feature_view.last_updated_timestamp = (
            feature_view_proto.meta.last_updated_timestamp.ToDatetime()
        )

    for interval in feature_view_proto.meta.materialization_intervals:
        feature_view.materialization_intervals.append(
            (
                utils.make_tzaware(interval.start_time.ToDatetime()),
                utils.make_tzaware(interval.end_time.ToDatetime()),
            )
        )

    return feature_view
def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Creates a feature view from a protobuf representation of a feature view.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A FeatureView object based on the feature view protobuf.
    """
    batch_source = DataSource.from_proto(feature_view_proto.spec.batch_source)
    stream_source = (
        DataSource.from_proto(feature_view_proto.spec.stream_source)
        if feature_view_proto.spec.HasField("stream_source")
        else None
    )
    feature_view = cls(
        name=feature_view_proto.spec.name,
        entities=[entity for entity in feature_view_proto.spec.entities],
        schema=[
            Field.from_proto(field_proto)
            for field_proto in feature_view_proto.spec.features
        ],
        description=feature_view_proto.spec.description,
        tags=dict(feature_view_proto.spec.tags),
        owner=feature_view_proto.spec.owner,
        online=feature_view_proto.spec.online,
        ttl=(
            timedelta(days=0)
            if feature_view_proto.spec.ttl.ToNanoseconds() == 0
            else feature_view_proto.spec.ttl.ToTimedelta()
        ),
        source=batch_source,
    )
    if stream_source:
        feature_view.stream_source = stream_source

    # FeatureViewProjections are not saved in the FeatureView proto.
    # Create the default projection.
    feature_view.projection = FeatureViewProjection.from_definition(feature_view)

    if feature_view_proto.meta.HasField("created_timestamp"):
        feature_view.created_timestamp = (
            feature_view_proto.meta.created_timestamp.ToDatetime()
        )
    if feature_view_proto.meta.HasField("last_updated_timestamp"):
        feature_view.last_updated_timestamp = (
            feature_view_proto.meta.last_updated_timestamp.ToDatetime()
        )

    for interval in feature_view_proto.meta.materialization_intervals:
        feature_view.materialization_intervals.append(
            (
                utils.make_tzaware(interval.start_time.ToDatetime()),
                utils.make_tzaware(interval.end_time.ToDatetime()),
            )
        )

    return feature_view
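# Hedged side note (standalone illustration, not Feast code): the ttl and timestamp
# handling above relies on the protobuf well-known types. A zero Duration maps to
# timedelta(0), and Timestamp.ToDatetime() yields a naive UTC datetime, which is why
# the materialization intervals are passed through utils.make_tzaware() above.
from google.protobuf.duration_pb2 import Duration
from google.protobuf.timestamp_pb2 import Timestamp

ttl = Duration()
assert ttl.ToNanoseconds() == 0
print(ttl.ToTimedelta())  # 0:00:00

ts = Timestamp()
ts.GetCurrentTime()
print(ts.ToDatetime().tzinfo)  # None -> naive datetime, hence make_tzaware()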
def apply_data_source(self, data_source: DataSource, project: str, commit: bool = True):
    """
    Registers a single data source with Feast.

    Args:
        data_source: A data source that will be registered.
        project: Feast project that this data source belongs to.
        commit: Whether to immediately commit to the registry.
    """
    registry = self._prepare_registry_for_changes()
    for idx, existing_data_source_proto in enumerate(registry.data_sources):
        if existing_data_source_proto.name == data_source.name:
            del registry.data_sources[idx]

    data_source_proto = data_source.to_proto()
    data_source_proto.data_source_class_type = (
        f"{data_source.__class__.__module__}.{data_source.__class__.__name__}"
    )
    data_source_proto.project = project
    registry.data_sources.append(data_source_proto)
    if commit:
        self.commit()
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
    user: str = "user",
    auth: Optional[Authentication] = None,
    http_scheme: Optional[str] = None,
) -> RetrievalJob:
    if not isinstance(data_source, TrinoSource):
        raise ValueError(
            f"The data_source object is not a TrinoSource object but is instead a {type(data_source)}"
        )

    from_expression = data_source.get_table_query_string()
    client = _get_trino_client(
        config=config, user=user, auth=auth, http_scheme=http_scheme
    )
    field_string = ", ".join(
        join_key_columns + feature_name_columns + [event_timestamp_column]
    )

    query = f"""
        SELECT {field_string}
        FROM {from_expression}
        WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
    """
    return TrinoRetrievalJob(
        query=query,
        client=client,
        config=config,
        full_feature_names=False,
    )
def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Creates a feature view from a protobuf representation of a feature view.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A FeatureView object based on the feature view protobuf.
    """
    feature_view = cls(
        name=feature_view_proto.spec.name,
        entities=[entity for entity in feature_view_proto.spec.entities],
        features=[
            Feature(
                name=feature.name,
                dtype=ValueType(feature.value_type),
                labels=feature.labels,
            )
            for feature in feature_view_proto.spec.features
        ],
        tags=dict(feature_view_proto.spec.tags),
        online=feature_view_proto.spec.online,
        ttl=(
            None
            if feature_view_proto.spec.ttl.seconds == 0
            and feature_view_proto.spec.ttl.nanos == 0
            else feature_view_proto.spec.ttl
        ),
        input=DataSource.from_proto(feature_view_proto.spec.input),
    )

    feature_view.created_timestamp = feature_view_proto.meta.created_timestamp

    return feature_view
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, RedshiftSource)
    from_expression = data_source.get_table_query_string()

    field_string = ", ".join(
        join_key_columns + feature_name_columns + [event_timestamp_column]
    )

    redshift_client = aws_utils.get_redshift_data_client(config.offline_store.region)
    s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

    start_date = start_date.astimezone(tz=utc)
    end_date = end_date.astimezone(tz=utc)

    query = f"""
        SELECT {field_string}
        FROM {from_expression}
        WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
    """
    return RedshiftRetrievalJob(
        query=query,
        redshift_client=redshift_client,
        s3_resource=s3_resource,
        config=config,
        full_feature_names=False,
    )
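# Hedged illustration (hypothetical column names, not from the source): with
# join_key_columns=["driver_id"], feature_name_columns=["trips"], and
# event_timestamp_column="event_ts", the f-string above renders roughly as:
#
#   SELECT driver_id, trips, event_ts
#   FROM <result of data_source.get_table_query_string()>
#   WHERE event_ts BETWEEN TIMESTAMP '2022-01-01 00:00:00+00:00' AND TIMESTAMP '2022-01-02 00:00:00+00:00'
#
# i.e. pull_all returns every row in the time range, while the pull_latest variants
# in this section additionally deduplicate to the newest row per entity key.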
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, BigQuerySource)
    from_expression = data_source.get_table_query_string()

    client = _get_bigquery_client(
        project=config.offline_store.project_id,
        location=config.offline_store.location,
    )
    field_string = ", ".join(
        join_key_columns + feature_name_columns + [event_timestamp_column]
    )
    query = f"""
        SELECT {field_string}
        FROM {from_expression}
        WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
    """
    return BigQueryRetrievalJob(
        query=query,
        client=client,
        config=config,
        full_feature_names=False,
    )
def pull_latest_from_table_or_query(
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> pyarrow.Table:
    assert isinstance(data_source, BigQuerySource)
    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    query = f"""
        SELECT {field_string}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
        )
        WHERE _feast_row = 1
    """
    return BigQueryOfflineStore._pull_query(query)
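# A minimal, runnable sketch (illustrative only, not Feast code) of what the
# ROW_NUMBER()-based query above computes: keep the newest row per entity key
# within [start_date, end_date], ordering by event timestamp, then created timestamp.
from datetime import datetime

rows = [
    {"driver_id": 1, "trips": 3, "event_ts": datetime(2022, 1, 1), "created_ts": datetime(2022, 1, 1)},
    {"driver_id": 1, "trips": 5, "event_ts": datetime(2022, 1, 2), "created_ts": datetime(2022, 1, 2)},
    {"driver_id": 2, "trips": 7, "event_ts": datetime(2022, 1, 1), "created_ts": datetime(2022, 1, 1)},
]
start_date, end_date = datetime(2022, 1, 1), datetime(2022, 1, 3)

latest = {}
for row in rows:
    if not (start_date <= row["event_ts"] <= end_date):
        continue  # mirrors the BETWEEN filter in the inner SELECT
    key = row["driver_id"]
    current = latest.get(key)
    if current is None or (row["event_ts"], row["created_ts"]) > (current["event_ts"], current["created_ts"]):
        latest[key] = row  # mirrors ROW_NUMBER() ... ORDER BY ... DESC, keeping _feast_row = 1

print(list(latest.values()))  # one "latest" row per driver_id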
def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest, context):
    """Produce a training dataset, return a job id that will provide a file reference."""
    if not self.is_whitelisted(request.project):
        raise ValueError(
            f"Project {request.project} is not whitelisted. Please contact your Feast administrator to whitelist it."
        )

    job = start_historical_feature_retrieval_job(
        client=self.client,
        project=request.project,
        entity_source=DataSource.from_proto(request.entity_source),
        feature_tables=self.client._get_feature_tables_from_feature_refs(
            list(request.feature_refs), request.project
        ),
        output_format=request.output_format,
        output_path=request.output_location,
    )

    output_file_uri = job.get_output_file_uri(block=False)

    job_start_timestamp = Timestamp()
    job_start_timestamp.FromDatetime(job.get_start_time())

    return GetHistoricalFeaturesResponse(
        id=job.get_id(),
        output_file_uri=output_file_uri,
        job_start_time=job_start_timestamp,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
    user: str = "user",
    auth: Optional[Authentication] = None,
    http_scheme: Optional[str] = None,
) -> TrinoRetrievalJob:
    if not isinstance(data_source, TrinoSource):
        raise ValueError(
            f"The data_source object is not a TrinoSource but is instead '{type(data_source)}'"
        )
    if not isinstance(config.offline_store, TrinoOfflineStoreConfig):
        raise ValueError(
            f"The config.offline_store object is not a TrinoOfflineStoreConfig but is instead '{type(config.offline_store)}'"
        )

    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    client = _get_trino_client(
        config=config, user=user, auth=auth, http_scheme=http_scheme
    )

    query = f"""
        SELECT
            {field_string}
            {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        )
        WHERE _feast_row = 1
    """

    # When materializing a single feature view, we don't need full feature names.
    # On demand transforms aren't materialized.
    return TrinoRetrievalJob(
        query=query,
        client=client,
        config=config,
        full_feature_names=False,
    )
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, PostgreSQLSource)
    from_expression = data_source.get_table_query_string()

    field_string = ", ".join(
        join_key_columns + feature_name_columns + [event_timestamp_column]
    )

    start_date = start_date.astimezone(tz=utc)
    end_date = end_date.astimezone(tz=utc)

    query = f"""
        SELECT {field_string}
        FROM {from_expression}
        WHERE "{event_timestamp_column}" BETWEEN '{start_date}'::timestamptz AND '{end_date}'::timestamptz
    """
    return PostgreSQLRetrievalJob(
        query=query,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, SnowflakeSource)
    assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig)

    from_expression = data_source.get_table_query_string()  # returns schema.table as a string

    if join_key_columns:
        partition_by_join_key_string = '"' + '", "'.join(join_key_columns) + '"'
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    else:
        partition_by_join_key_string = ""

    timestamp_columns = [event_timestamp_column]
    if created_timestamp_column:
        timestamp_columns.append(created_timestamp_column)

    timestamp_desc_string = '"' + '" DESC, "'.join(timestamp_columns) + '" DESC'
    field_string = (
        '"'
        + '", "'.join(join_key_columns + feature_name_columns + timestamp_columns)
        + '"'
    )

    if data_source.snowflake_options.warehouse:
        config.offline_store.warehouse = data_source.snowflake_options.warehouse

    snowflake_conn = get_snowflake_conn(config.offline_store)

    query = f"""
        SELECT
            {field_string}
            {f''', TRIM({repr(DUMMY_ENTITY_VAL)}::VARIANT,'"') AS "{DUMMY_ENTITY_ID}"''' if not join_key_columns else ""}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS "_feast_row"
            FROM {from_expression}
            WHERE "{event_timestamp_column}" BETWEEN TO_TIMESTAMP_NTZ({start_date.timestamp()}) AND TO_TIMESTAMP_NTZ({end_date.timestamp()})
        )
        WHERE "_feast_row" = 1
    """

    return SnowflakeRetrievalJob(
        query=query,
        snowflake_conn=snowflake_conn,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
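# Hedged side note (standalone illustration, not Feast code; column names are
# hypothetical): the string building above produces double-quoted identifiers,
# which keeps case-sensitive Snowflake column names intact.
join_key_columns = ["driver_id", "customer_id"]
field_string = '"' + '", "'.join(join_key_columns) + '"'
print(field_string)                    # "driver_id", "customer_id"
print("PARTITION BY " + field_string)  # PARTITION BY "driver_id", "customer_id"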
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    spark_session = get_spark_session_or_start_new_with_repoconfig(
        config.offline_store
    )
    assert isinstance(config.offline_store, SparkOfflineStoreConfig)
    assert isinstance(data_source, SparkSource)
    warnings.warn(
        "The spark offline store is an experimental feature in alpha development. "
        "Some functionality may still be unstable so functionality can change in the future.",
        RuntimeWarning,
    )

    print("Pulling latest features from spark offline store")

    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    start_date_str = _format_datetime(start_date)
    end_date_str = _format_datetime(end_date)
    query = f"""
        SELECT
            {field_string}
            {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_
            FROM {from_expression} t1
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}')
        ) t2
        WHERE feast_row_ = 1
    """
    return SparkRetrievalJob(
        spark_session=spark_session,
        query=query,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def _to_data_source(cls, data_source):
    """
    Convert dict to data source.
    """
    source_type = SourceType(data_source.type).name

    if (
        source_type == "BATCH_FILE"
        and data_source.file_options.file_format
        and data_source.file_options.file_url
    ):
        data_source_options = FileOptions(
            file_format=data_source.file_options.file_format,
            file_url=data_source.file_options.file_url,
        )
    elif source_type == "BATCH_BIGQUERY" and data_source.bigquery_options.table_ref:
        data_source_options = BigQueryOptions(
            table_ref=data_source.bigquery_options.table_ref,
        )
    elif (
        source_type == "STREAM_KAFKA"
        and data_source.kafka_options.bootstrap_servers
        and data_source.kafka_options.topic
        and data_source.kafka_options.class_path
    ):
        data_source_options = KafkaOptions(
            bootstrap_servers=data_source.kafka_options.bootstrap_servers,
            class_path=data_source.kafka_options.class_path,
            topic=data_source.kafka_options.topic,
        )
    elif (
        source_type == "STREAM_KINESIS"
        and data_source.kinesis_options.class_path
        and data_source.kinesis_options.region
        and data_source.kinesis_options.stream_name
    ):
        data_source_options = KinesisOptions(
            class_path=data_source.kinesis_options.class_path,
            region=data_source.kinesis_options.region,
            stream_name=data_source.kinesis_options.stream_name,
        )
    else:
        raise ValueError("Could not identify the source type being added")

    data_source_proto = DataSource(
        type=data_source.type,
        field_mapping=data_source.field_mapping,
        options=data_source_options,
        timestamp_column=data_source.timestamp_column,
        date_partition_column=data_source.date_partition_column,
    ).to_proto()

    return data_source_proto
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, RedshiftSource)
    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamp_columns = [event_timestamp_column]
    if created_timestamp_column:
        timestamp_columns.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamp_columns) + " DESC"
    field_string = ", ".join(
        join_key_columns + feature_name_columns + timestamp_columns
    )

    redshift_client = aws_utils.get_redshift_data_client(config.offline_store.region)
    s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

    query = f"""
        SELECT
            {field_string}
            {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        )
        WHERE _feast_row = 1
    """

    # When materializing a single feature view, we don't need full feature names.
    # On demand transforms aren't materialized.
    return RedshiftRetrievalJob(
        query=query,
        redshift_client=redshift_client,
        s3_resource=s3_resource,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def GetHistoricalFeatures(self, request, context):
    """Produce a training dataset, return a job id that will provide a file reference."""
    job = self.client.get_historical_features(
        request.feature_refs,
        entity_source=DataSource.from_proto(request.entity_source),
        project=request.project,
        output_location=request.output_location,
    )

    output_file_uri = job.get_output_file_uri(block=False)

    return GetHistoricalFeaturesResponse(
        id=job.get_id(), output_file_uri=output_file_uri
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, PostgreSQLSource)
    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(_append_alias(join_key_columns, "a"))
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(_append_alias(timestamps, "a")) + " DESC"
    a_field_string = ", ".join(
        _append_alias(join_key_columns + feature_name_columns + timestamps, "a")
    )
    b_field_string = ", ".join(
        _append_alias(join_key_columns + feature_name_columns + timestamps, "b")
    )

    query = f"""
        SELECT
            {b_field_string}
            {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
        FROM (
            SELECT {a_field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM ({from_expression}) a
            WHERE a."{event_timestamp_column}" BETWEEN '{start_date}'::timestamptz AND '{end_date}'::timestamptz
        ) b
        WHERE _feast_row = 1
    """

    return PostgreSQLRetrievalJob(
        query=query,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
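# Hedged sketch (an assumption, not the actual Feast helper): _append_alias above
# presumably prefixes each column with the given table alias and quotes it, so the
# rendered query can reference columns as a."driver_id", b."event_ts", and so on.
# A minimal equivalent might look like this:
from typing import List

def _append_alias(field_names: List[str], alias: str) -> List[str]:
    return [f'{alias}."{field_name}"' for field_name in field_names]

print(", ".join(_append_alias(["driver_id", "event_ts"], "a")))
# a."driver_id", a."event_ts"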
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, BigQuerySource)
    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    client = _get_bigquery_client(
        project=config.offline_store.project_id,
        location=config.offline_store.location,
    )
    query = f"""
        SELECT
            {field_string}
            {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
        )
        WHERE _feast_row = 1
    """

    # When materializing a single feature view, we don't need full feature names.
    # On demand transforms aren't materialized.
    return BigQueryRetrievalJob(
        query=query,
        client=client,
        config=config,
        full_feature_names=False,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, RedshiftSource)
    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamp_columns = [event_timestamp_column]
    if created_timestamp_column:
        timestamp_columns.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamp_columns) + " DESC"
    field_string = ", ".join(
        join_key_columns + feature_name_columns + timestamp_columns
    )

    redshift_client = aws_utils.get_redshift_data_client(config.offline_store.region)
    s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

    query = f"""
        SELECT {field_string}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        )
        WHERE _feast_row = 1
    """
    return RedshiftRetrievalJob(
        query=query,
        redshift_client=redshift_client,
        s3_resource=s3_resource,
        config=config,
    )
def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest, context):
    """Produce a training dataset, return a job id that will provide a file reference."""
    job = start_historical_feature_retrieval_job(
        client=self.client,
        project=request.project,
        entity_source=DataSource.from_proto(request.entity_source),
        feature_tables=self.client._get_feature_tables_from_feature_refs(
            list(request.feature_refs), request.project
        ),
        output_format=request.output_format,
        output_path=request.output_location,
    )

    output_file_uri = job.get_output_file_uri(block=False)

    return GetHistoricalFeaturesResponse(
        id=job.get_id(), output_file_uri=output_file_uri
    )
def list_data_sources(self, project: str, allow_cache: bool = False) -> List[DataSource]:
    """
    Retrieves a list of data sources from the registry.

    Args:
        project: Filter data sources based on project name.
        allow_cache: Whether to allow returning data sources from a cached registry.

    Returns:
        List of data sources.
    """
    registry_proto = self._get_registry_proto(allow_cache=allow_cache)
    data_sources = []
    for data_source_proto in registry_proto.data_sources:
        if data_source_proto.project == project:
            data_sources.append(DataSource.from_proto(data_source_proto))
    return data_sources
def test_feature_table_import_export_yaml(self):
    batch_source = DataSource(
        type=SourceType(1).name,
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        options=FileOptions(file_format="avro", file_url="data/test.avro"),
        timestamp_column="ts_col",
        date_partition_column="date_partition_col",
    )

    stream_source = DataSource(
        type=SourceType(3).name,
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        options=KafkaOptions(
            bootstrap_servers="localhost:9094",
            class_path="random/path/to/class",
            topic="test_topic",
        ),
        timestamp_column="ts_col",
    )

    test_feature_table = FeatureTable(
        name="car_driver",
        features=[
            FeatureV2(name="ride_distance", dtype=ValueType.FLOAT).to_proto(),
            FeatureV2(name="ride_duration", dtype=ValueType.STRING).to_proto(),
        ],
        entities=["car_driver_entity"],
        labels={"team": "matchmaking"},
        batch_source=batch_source.to_proto(),
        stream_source=stream_source.to_proto(),
    )

    # Create a string YAML representation of the feature table
    string_yaml = test_feature_table.to_yaml()

    # Create a new feature table object from the YAML string
    actual_feature_table_from_string = FeatureTable.from_yaml(string_yaml)

    # Ensure equality is upheld to the original feature table
    assert test_feature_table == actual_feature_table_from_string
def get_data_source(self, name: str, project: str, allow_cache: bool = False) -> DataSource:
    """
    Retrieves a data source.

    Args:
        name: Name of the data source.
        project: Feast project that this data source belongs to.
        allow_cache: Whether to allow returning this data source from a cached registry.

    Returns:
        The specified data source; raises an exception if none is found.
    """
    registry = self._get_registry_proto(allow_cache=allow_cache)

    for data_source in registry.data_sources:
        if data_source.project == project and data_source.name == name:
            return DataSource.from_proto(data_source)
    raise DataSourceObjectNotFoundException(name, project=project)
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """
    Note that join_key_columns, feature_name_columns, event_timestamp_column,
    and created_timestamp_column have all already been mapped to column names
    of the source table and those column names are the values passed into this
    function.
    """
    assert isinstance(data_source, SparkSource)
    warnings.warn(
        "The spark offline store is an experimental feature in alpha development. "
        "This API is unstable and it could and most probably will be changed in the future.",
        RuntimeWarning,
    )
    spark_session = get_spark_session_or_start_new_with_repoconfig(
        store_config=config.offline_store
    )

    fields = ", ".join(
        join_key_columns + feature_name_columns + [event_timestamp_column]
    )
    from_expression = data_source.get_table_query_string()
    start_date = start_date.astimezone(tz=utc)
    end_date = end_date.astimezone(tz=utc)

    query = f"""
        SELECT {fields}
        FROM {from_expression}
        WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
    """
    return SparkRetrievalJob(
        spark_session=spark_session, query=query, full_feature_names=False
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, BigQuerySource)
    from_expression = data_source.get_table_query_string()

    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string
    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    client = _get_bigquery_client(project=config.offline_store.project_id)
    query = f"""
        SELECT {field_string}
        FROM (
            SELECT {field_string},
            ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
        )
        WHERE _feast_row = 1
    """
    return BigQueryRetrievalJob(query=query, client=client, config=config)
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    assert isinstance(data_source, SnowflakeSource)
    from_expression = data_source.get_table_query_string()

    field_string = (
        '"'
        + '", "'.join(join_key_columns + feature_name_columns + [event_timestamp_column])
        + '"'
    )

    if data_source.snowflake_options.warehouse:
        config.offline_store.warehouse = data_source.snowflake_options.warehouse

    snowflake_conn = get_snowflake_conn(config.offline_store)

    start_date = start_date.astimezone(tz=utc)
    end_date = end_date.astimezone(tz=utc)

    query = f"""
        SELECT {field_string}
        FROM {from_expression}
        WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
    """

    return SnowflakeRetrievalJob(
        query=query,
        snowflake_conn=snowflake_conn,
        config=config,
        full_feature_names=False,
    )