def _convert_arrow_to_proto(
    table: pyarrow.Table, feature_view: FeatureView
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]:
    """Convert an arrow table into rows ready to be written to the online store.

    Args:
        table: Arrow table whose columns include the feature view's entities,
            features, and timestamp columns.
        feature_view: The feature view describing which columns are entities,
            features, and timestamps (via ``feature_view.input``).

    Returns:
        One tuple per table row: (entity key proto, {feature name: value proto},
        event timestamp, created timestamp or None).
    """
    rows_to_write = []

    # Hoist every column-index lookup out of the per-row loop: list.index() is
    # O(columns) and all of these indices are loop-invariant.
    column_names_idx = {name: i for i, name in enumerate(table.column_names)}
    entity_idxs = [
        (entity_name, column_names_idx[entity_name])
        for entity_name in feature_view.entities
    ]
    feature_idxs = [
        (feature.name, column_names_idx[feature.name])
        for feature in feature_view.features
    ]
    event_timestamp_idx = column_names_idx[feature_view.input.event_timestamp_column]
    # NOTE(review): kept the original `is not None` test; the sibling overload
    # uses plain truthiness — confirm whether "" is a possible sentinel here.
    if feature_view.input.created_timestamp_column is not None:
        created_timestamp_idx: Optional[int] = column_names_idx[
            feature_view.input.created_timestamp_column
        ]
    else:
        created_timestamp_idx = None

    # zip(*to_pydict().values()) transposes the columnar table into row tuples.
    for row in zip(*table.to_pydict().values()):
        entity_key = EntityKeyProto()
        for entity_name, idx in entity_idxs:
            entity_key.entity_names.append(entity_name)
            entity_key.entity_values.append(python_value_to_proto_value(row[idx]))

        feature_dict = {
            name: python_value_to_proto_value(row[idx]) for name, idx in feature_idxs
        }

        event_timestamp = row[event_timestamp_idx]
        created_timestamp = (
            row[created_timestamp_idx] if created_timestamp_idx is not None else None
        )
        rows_to_write.append(
            (entity_key, feature_dict, event_timestamp, created_timestamp)
        )
    return rows_to_write
def _convert_arrow_to_proto(
    table: pyarrow.Table,
    feature_view: FeatureView,
    join_keys: List[str],
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]:
    """Turn an arrow table into online-store write tuples.

    Each returned tuple is (entity key proto, {feature name: value proto},
    event timestamp, created timestamp or None), one per table row.
    """

    def _coerce_datetime(ts):
        """
        Normalize a timestamp cell to a plain python datetime.

        Arrow's to_pydict() yields pandas.Timestamp for nanosecond-resolution
        columns and plain datetime for microsecond resolution. Although
        pandas.Timestamp subclasses datetime, it does not always behave the
        same, so we convert it here to spare downstream consumers the quirks.
        """
        return ts.to_pydatetime() if isinstance(ts, pandas.Timestamp) else ts

    col_idx = {name: pos for pos, name in enumerate(table.column_names)}
    event_ts_col = feature_view.batch_source.event_timestamp_column
    created_ts_col = feature_view.batch_source.created_timestamp_column

    output: List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]] = []
    # Transpose the columnar dict into row tuples and build protos row by row.
    for row in zip(*table.to_pydict().values()):
        entity_key = EntityKeyProto()
        for jk in join_keys:
            entity_key.join_keys.append(jk)
            entity_key.entity_values.append(
                python_value_to_proto_value(row[col_idx[jk]])
            )

        feature_dict = {
            feature.name: python_value_to_proto_value(
                row[col_idx[feature.name]], feature.dtype
            )
            for feature in feature_view.features
        }

        event_timestamp = _coerce_datetime(row[col_idx[event_ts_col]])
        created_timestamp = (
            _coerce_datetime(row[col_idx[created_ts_col]]) if created_ts_col else None
        )
        output.append((entity_key, feature_dict, event_timestamp, created_timestamp))
    return output
def _convert_arrow_to_proto(
    table: Union[pyarrow.Table, pyarrow.RecordBatch],
    feature_view: FeatureView,
    join_keys: List[str],
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]:
    """Convert an arrow table/batch into online-store write tuples, column-wise.

    Unlike the row-at-a-time variant, this serializes whole columns first and
    then zips them into per-row tuples.

    Returns:
        One tuple per row: (entity key proto, {feature name: value proto},
        event timestamp, created timestamp or None).
    """
    # Handle join keys: pull each join-key column out as a python list once.
    join_key_values = {k: table.column(k).to_pylist() for k in join_keys}
    entity_keys = [
        EntityKeyProto(
            join_keys=join_keys,
            entity_values=[
                python_value_to_proto_value(join_key_values[k][idx])
                for k in join_keys
            ],
        )
        for idx in range(table.num_rows)
    ]

    # Serialize the features column-wise: {feature name: [proto value per row]}.
    # (Renamed from `feature_dict` — it holds columns here, not a single row.)
    feature_columns = {
        feature.name: [
            python_value_to_proto_value(val, feature.dtype)
            for val in table.column(feature.name).to_pylist()
        ]
        for feature in feature_view.features
    }
    # Transpose the columns into one {feature name: value} dict per row.
    # (Renamed the row tuple from `vars`, which shadowed the builtin.)
    features = [
        dict(zip(feature_columns, row_values))
        for row_values in zip(*feature_columns.values())
    ]

    # Convert event_timestamps
    event_timestamps = [
        _coerce_datetime(val)
        for val in table.column(
            feature_view.batch_source.event_timestamp_column
        ).to_pylist()
    ]

    # Convert created_timestamps if they exist; otherwise pad with None so the
    # final zip still yields 4-tuples.
    if feature_view.batch_source.created_timestamp_column:
        created_timestamps = [
            _coerce_datetime(val)
            for val in table.column(
                feature_view.batch_source.created_timestamp_column
            ).to_pylist()
        ]
    else:
        created_timestamps = [None] * table.num_rows

    return list(zip(entity_keys, features, event_timestamps, created_timestamps))
def _augment_response_with_on_demand_transforms(
    self,
    feature_refs: List[str],
    full_feature_names: bool,
    initial_response: OnlineResponse,
    result_rows: List[GetOnlineFeaturesResponse.FieldValues],
) -> OnlineResponse:
    """Apply on-demand feature view (ODFV) transformations to an online response.

    Args:
        feature_refs: Requested "view:feature" references; only those whose
            view is an ODFV are transformed here.
        full_feature_names: If True, output columns are "<view>__<feature>".
        initial_response: The response built from materialized features.
        result_rows: The proto rows backing the response; mutated in place.

    Returns:
        A new OnlineResponse wrapping the (possibly augmented) result rows.
    """
    all_on_demand_feature_views = {
        view.name: view
        for view in self._registry.list_on_demand_feature_views(
            project=self.project, allow_cache=True
        )
    }
    # No ODFVs registered: nothing to transform, return the response untouched.
    if not all_on_demand_feature_views:
        return initial_response

    initial_response_df = initial_response.to_df()

    # Group the requested feature names by their owning ODFV.
    odfv_feature_refs = defaultdict(list)
    for feature_ref in feature_refs:
        view_name, feature_name = feature_ref.split(":")
        if view_name in all_on_demand_feature_views:
            odfv_feature_refs[view_name].append(feature_name)

    # Apply on demand transformations
    for odfv_name, _feature_refs in odfv_feature_refs.items():
        odfv = all_on_demand_feature_views[odfv_name]
        transformed_features_df = odfv.get_transformed_features_df(
            full_feature_names, initial_response_df
        )
        # Hoisted out of the per-row loop: the set of requested transformed
        # columns does not vary by row.
        selected_subset = [
            f for f in transformed_features_df.columns if f in _feature_refs
        ]
        for row_idx, result_row in enumerate(result_rows):
            for transformed_feature in selected_subset:
                transformed_feature_name = (
                    f"{odfv.name}__{transformed_feature}"
                    if full_feature_names
                    else transformed_feature
                )
                proto_value = python_value_to_proto_value(
                    transformed_features_df[transformed_feature].values[row_idx]
                )
                result_row.fields[transformed_feature_name].CopyFrom(proto_value)
                result_row.statuses[
                    transformed_feature_name
                ] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

    return OnlineResponse(GetOnlineFeaturesResponse(field_values=result_rows))
def get_online_features(
    self,
    features: Union[List[str], FeatureService],
    entity_rows: List[Dict[str, Any]],
    feature_refs: Optional[List[str]] = None,
    full_feature_names: bool = False,
) -> OnlineResponse:
    """
    Retrieves the latest online feature data.

    Note: This method will download the full feature registry the first time it is run. If you are using a
    remote registry like GCS or S3 then that may take a few seconds. The registry remains cached up to a TTL
    duration (which can be set to infinity). If the cached registry is stale (more time than the TTL has
    passed), then a new registry will be downloaded synchronously by this method. This download may
    introduce latency to online feature retrieval. In order to avoid synchronous downloads, please call
    refresh_registry() prior to the TTL being reached. Remember it is possible to set the cache TTL to
    infinity (cache forever).

    Args:
        features: A FeatureService, or a list of feature references that will be returned for each entity.
            Each feature reference should have the following format:
            "feature_table:feature" where "feature_table" & "feature" refer to
            the feature and feature table names respectively.
        entity_rows: A list of dictionaries where each key-value is an entity-name, entity-value pair.
            Keys may also name request-data inputs required by on-demand feature views.
        feature_refs: Deprecated alias for passing feature references; prefer ``features``.
        full_feature_names: If True, returned feature names are prefixed with
            their feature view name ("<view>__<feature>").

    Returns:
        OnlineResponse containing the feature data in records.

    Raises:
        Exception: No entity with the specified name exists.

    Examples:
        Retrieve online features for a list of driver entity rows.

        >>> from feast import FeatureStore, RepoConfig
        >>> fs = FeatureStore(repo_path="feature_repo")
        >>> online_response = fs.get_online_features(
        ...     features=[
        ...         "driver_hourly_stats:conv_rate",
        ...         "driver_hourly_stats:acc_rate",
        ...         "driver_hourly_stats:avg_daily_trips",
        ...     ],
        ...     entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}, {"driver_id": 1003}, {"driver_id": 1004}],
        ... )
        >>> online_response_dict = online_response.to_dict()
    """
    # Resolve the requested references (handles FeatureService vs plain list,
    # and the deprecated feature_refs argument).
    _feature_refs = self._get_features(features, feature_refs)
    all_feature_views = self._list_feature_views(allow_cache=True,
                                                 hide_dummy_entity=False)
    all_on_demand_feature_views = self._registry.list_on_demand_feature_views(
        project=self.project, allow_cache=True)

    _validate_feature_refs(_feature_refs, full_feature_names)
    # Split refs into regular feature-view refs and on-demand (ODFV) refs.
    grouped_refs, grouped_odfv_refs = _group_feature_refs(
        _feature_refs, all_feature_views, all_on_demand_feature_views)
    if len(grouped_odfv_refs) > 0:
        log_event(UsageEvent.GET_ONLINE_FEATURES_WITH_ODFV)

    feature_views = list(view for view, _ in grouped_refs)
    # "Entityless" feature views are keyed on a sentinel dummy entity.
    entityless_case = DUMMY_ENTITY_NAME in [
        entity_name for feature_view in feature_views
        for entity_name in feature_view.entities
    ]

    provider = self._get_provider()
    entities = self._list_entities(allow_cache=True, hide_dummy_entity=False)
    # Map user-facing entity names to the join keys used for storage lookups.
    entity_name_to_join_key_map = {}
    for entity in entities:
        entity_name_to_join_key_map[entity.name] = entity.join_key

    needed_request_data_features = self._get_needed_request_data_features(
        grouped_odfv_refs)

    join_key_rows = []
    request_data_features: Dict[str, List[Any]] = {}
    # Entity rows may be either entities or request data.
    for row in entity_rows:
        join_key_row = {}
        for entity_name, entity_value in row.items():
            # Found request data
            if entity_name in needed_request_data_features:
                if entity_name not in request_data_features:
                    request_data_features[entity_name] = []
                request_data_features[entity_name].append(entity_value)
                continue
            try:
                join_key = entity_name_to_join_key_map[entity_name]
            except KeyError:
                raise EntityNotFoundException(entity_name, self.project)
            join_key_row[join_key] = entity_value
            if entityless_case:
                join_key_row[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
        if len(join_key_row) > 0:
            # May be empty if this entity row was request data
            join_key_rows.append(join_key_row)

    # Every request-data input demanded by an ODFV must appear in entity_rows.
    if len(needed_request_data_features) != len(
            request_data_features.keys()):
        raise RequestDataNotFoundInEntityRowsException(
            feature_names=needed_request_data_features)

    entity_row_proto_list = _infer_online_entity_rows(join_key_rows)

    union_of_entity_keys: List[EntityKeyProto] = []
    result_rows: List[GetOnlineFeaturesResponse.FieldValues] = []

    for entity_row_proto in entity_row_proto_list:
        # Create a list of entity keys to filter down for each feature view at lookup time.
        union_of_entity_keys.append(_entity_row_to_key(entity_row_proto))
        # Also create entity values to append to the result
        result_rows.append(_entity_row_to_field_values(entity_row_proto))

    # Add more feature values to the existing result rows for the request data features
    for feature_name, feature_values in request_data_features.items():
        for row_idx, feature_value in enumerate(feature_values):
            result_row = result_rows[row_idx]
            result_row.fields[feature_name].CopyFrom(
                python_value_to_proto_value(feature_value))
            result_row.statuses[
                feature_name] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

    # Look up the materialized features for each feature view and merge them
    # into the result rows in place.
    for table, requested_features in grouped_refs:
        self._populate_result_rows_from_feature_view(
            entity_name_to_join_key_map,
            full_feature_names,
            provider,
            requested_features,
            result_rows,
            table,
            union_of_entity_keys,
        )

    initial_response = OnlineResponse(
        GetOnlineFeaturesResponse(field_values=result_rows))
    # Finally, run any on-demand transformations over the assembled response.
    return self._augment_response_with_on_demand_transforms(
        _feature_refs, full_feature_names, initial_response, result_rows)