def online_read(
    self,
    project: str,
    table: Union[FeatureTable, FeatureView],
    entity_key: EntityKeyProto,
) -> Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]:
    """Read every stored feature value for a single entity key.

    Args:
        project: Project name; combined with ``table`` to derive the table id.
        table: Feature table or feature view whose rows are queried.
        entity_key: Key to look up. It is serialized and matched against the
            ``entity_key`` column.

    Returns:
        ``(event_ts, features)`` where ``features`` maps each feature name to
        its parsed ``ValueProto`` and ``event_ts`` is the timestamp of the
        last row fetched; ``(None, None)`` when no row matches the key.
    """
    key_bytes = serialize_entity_key(entity_key)
    cursor = self._get_conn().cursor()
    cursor.execute(
        f"SELECT feature_name, value, event_ts FROM {_table_id(project, table)} WHERE entity_key = ?",
        (key_bytes,),
    )

    features = {}
    latest_ts = None
    for name, raw_value, row_ts in cursor.fetchall():
        proto = ValueProto()
        proto.ParseFromString(raw_value)
        features[name] = proto
        # The timestamp of the last row seen is the one reported.
        latest_ts = row_ts

    if features:
        return latest_ts, features
    return None, None
def compute_entity_id(entity_key: EntityKeyProto) -> str:
    """
    Derive the online-store entity id for a Feast Entity Key.

    "Entity" here means the `EntityKeyProto` that some online stores use to
    encode their keys; it is unrelated to the Entity concept in Feast.

    The id is the hex form of the mmh3 hash of the serialized entity key.
    """
    serialized_key = serialize_entity_key(entity_key)
    digest = mmh3.hash_bytes(serialized_key)
    return digest.hex()
def online_read(
    self,
    config: RepoConfig,
    table: Union[FeatureTable, FeatureView],
    entity_keys: List[EntityKeyProto],
    requested_features: Optional[List[str]] = None,
) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]:
    """Read the stored feature values for each of the given entity keys.

    One query is issued per entity key, in the order the keys are given.

    Args:
        config: Repo configuration; supplies the connection and the project
            name used to derive the table id.
        table: Feature table or feature view whose rows are queried.
        entity_keys: Keys to look up. Each is serialized and matched against
            the ``entity_key`` column.
        requested_features: Not used by this implementation; every stored
            feature for a key is returned.

    Returns:
        One tuple per entry of ``entity_keys``, in the same order. Each tuple
        is ``(event_ts, features)``, where ``features`` maps feature name to
        its parsed ``ValueProto`` and ``event_ts`` is the timestamp of the
        last row fetched for that key, or ``(None, None)`` when the key has
        no rows.
    """
    conn = self._get_conn(config)
    cur = conn.cursor()

    # The target table is the same for every key, so build the statement once
    # instead of on every loop iteration.
    query = f"SELECT feature_name, value, event_ts FROM {_table_id(config.project, table)} WHERE entity_key = ?"

    result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = []
    for entity_key in entity_keys:
        cur.execute(query, (serialize_entity_key(entity_key),))

        res = {}
        res_ts = None
        for feature_name, val_bin, ts in cur.fetchall():
            val = ValueProto()
            val.ParseFromString(val_bin)
            res[feature_name] = val
            res_ts = ts

        if res:
            result.append((res_ts, res))
        else:
            result.append((None, None))
    return result
def online_read(
    self,
    config: RepoConfig,
    table: FeatureView,
    entity_keys: List[EntityKeyProto],
    requested_features: Optional[List[str]] = None,
) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]:
    """Read the stored feature values for all given entity keys in one query.

    Args:
        config: Repo configuration; supplies the connection and the project
            name used to derive the table id.
        table: Feature view whose rows are queried.
        entity_keys: Keys to look up. Each is serialized and matched against
            the ``entity_key`` column.
        requested_features: Not used by this implementation; every stored
            feature for a key is returned.

    Returns:
        One tuple per entry of ``entity_keys``, in the same order. Each tuple
        is ``(event_ts, features)``, where ``features`` maps feature name to
        its parsed ``ValueProto`` and ``event_ts`` is the timestamp of the
        last row fetched for that key, or ``(None, None)`` when the key has
        no rows.
    """
    conn = self._get_conn(config)
    cur = conn.cursor()

    result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = []
    if not entity_keys:
        # Nothing to look up; also avoids emitting an empty ``IN ()`` list,
        # which not every SQL backend accepts.
        return result

    # Serialize each key exactly once; the same bytes are used both as query
    # parameters and as lookup keys for the fetched rows.
    serialized_keys = [serialize_entity_key(entity_key) for entity_key in entity_keys]

    with tracing_span(name="remote_call"):
        # Fetch all entities in one go
        cur.execute(
            f"SELECT entity_key, feature_name, value, event_ts "
            f"FROM {_table_id(config.project, table)} "
            f"WHERE entity_key IN ({','.join('?' * len(serialized_keys))}) "
            f"ORDER BY entity_key",
            serialized_keys,
        )
        rows = cur.fetchall()

    # Group rows by entity key without relying on the rows being contiguous
    # per key; row order within each group is preserved.
    rows_by_key: Dict[bytes, list] = {}
    for row in rows:
        rows_by_key.setdefault(row[0], []).append(row)

    for entity_key_bin in serialized_keys:
        res = {}
        res_ts = None
        for _, feature_name, val_bin, ts in rows_by_key.get(entity_key_bin, []):
            val = ValueProto()
            val.ParseFromString(val_bin)
            res[feature_name] = val
            res_ts = ts

        if res:
            result.append((res_ts, res))
        else:
            result.append((None, None))
    return result
def compute_datastore_entity_id(entity_key: EntityKeyProto) -> str:
    """
    Derive the Datastore Entity id for a Feast Entity Key.

    A Datastore Entity is a concept from the Datastore data model and is
    unrelated to the Entity concept in Feast.

    The id is the hex form of the mmh3 hash of the serialized entity key.
    """
    serialized_key = serialize_entity_key(entity_key)
    digest = mmh3.hash_bytes(serialized_key)
    return digest.hex()
def online_write_batch(
    self,
    config: RepoConfig,
    table: Union[FeatureTable, FeatureView],
    data: List[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
) -> None:
    """Write a batch of feature values, overwriting any existing rows.

    For every feature of every entry an ``UPDATE`` is issued first, followed
    by an ``INSERT OR IGNORE`` that creates the row when it did not exist yet.

    Args:
        config: Repo configuration; supplies the connection and the project
            name used to derive the table id.
        table: Feature table or feature view being written to.
        data: Entries of ``(entity_key, values, event_timestamp, created_ts)``
            where ``values`` maps feature name to its ``ValueProto``.
            Timestamps are passed through ``_to_naive_utc`` before storage;
            ``created_ts`` may be ``None`` and is then stored as-is.
        progress: Optional callback, invoked with ``1`` after each entry of
            ``data`` has been written.
    """
    conn = self._get_conn(config)

    # The target table does not change during the batch, so resolve it and
    # build both statements once rather than for every feature.
    table_id = _table_id(config.project, table)
    update_sql = f"""
        UPDATE {table_id}
        SET value = ?, event_ts = ?, created_ts = ?
        WHERE (entity_key = ? AND feature_name = ?)
    """
    insert_sql = f"""INSERT OR IGNORE INTO {table_id}
        (entity_key, feature_name, value, event_ts, created_ts)
        VALUES (?, ?, ?, ?, ?)"""

    # NOTE(review): the connection is used as a context manager, presumably
    # so the whole batch is one transaction - confirm against the
    # connection type returned by _get_conn.
    with conn:
        for entity_key, values, timestamp, created_ts in data:
            entity_key_bin = serialize_entity_key(entity_key)
            timestamp = _to_naive_utc(timestamp)
            if created_ts is not None:
                created_ts = _to_naive_utc(created_ts)

            for feature_name, val in values.items():
                # Serialize once; both statements store the same bytes.
                value_bin = val.SerializeToString()
                conn.execute(
                    update_sql,
                    (
                        # SET
                        value_bin,
                        timestamp,
                        created_ts,
                        # WHERE
                        entity_key_bin,
                        feature_name,
                    ),
                )
                conn.execute(
                    insert_sql,
                    (
                        entity_key_bin,
                        feature_name,
                        value_bin,
                        timestamp,
                        created_ts,
                    ),
                )
            if progress:
                progress(1)
def online_write_batch(
    self,
    project: str,
    table: Union[FeatureTable, FeatureView],
    data: List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]],
) -> None:
    """Write a batch of feature values, keeping newer stored rows intact.

    For every feature of every entry an ``UPDATE`` is issued that only
    applies when the stored ``event_ts`` is older than the incoming one, or
    equal to it while either ``created_ts`` is NULL or the stored
    ``created_ts`` is older. An ``INSERT OR IGNORE`` then creates the row
    when it did not exist yet.

    Args:
        project: Project name; combined with ``table`` to derive the table id.
        table: Feature table or feature view being written to.
        data: Entries of ``(entity_key, values, event_timestamp, created_ts)``
            where ``values`` maps feature name to its ``ValueProto``.
            Timestamps are passed through ``_to_naive_utc`` before storage;
            ``created_ts`` may be ``None`` and is then stored as-is.
    """
    conn = self._get_conn()

    # The target table does not change during the batch, so resolve it and
    # build both statements once rather than for every feature.
    table_id = _table_id(project, table)
    update_sql = f"""
        UPDATE {table_id}
        SET value = ?, event_ts = ?, created_ts = ?
        WHERE (event_ts < ? OR (event_ts = ? AND (created_ts IS NULL OR ? IS NULL OR created_ts < ?)))
        AND (entity_key = ? AND feature_name = ?)
    """
    insert_sql = f"""INSERT OR IGNORE INTO {table_id}
        (entity_key, feature_name, value, event_ts, created_ts)
        VALUES (?, ?, ?, ?, ?)"""

    # NOTE(review): the connection is used as a context manager, presumably
    # so the whole batch is one transaction - confirm against the
    # connection type returned by _get_conn.
    with conn:
        for entity_key, values, timestamp, created_ts in data:
            # Key and timestamps are per-entry, not per-feature: normalize
            # them once here instead of again for every feature.
            entity_key_bin = serialize_entity_key(entity_key)
            timestamp = _to_naive_utc(timestamp)
            if created_ts is not None:
                created_ts = _to_naive_utc(created_ts)

            for feature_name, val in values.items():
                # Serialize once; both statements store the same bytes.
                value_bin = val.SerializeToString()
                conn.execute(
                    update_sql,
                    (
                        # SET
                        value_bin,
                        timestamp,
                        created_ts,
                        # WHERE
                        timestamp,
                        timestamp,
                        created_ts,
                        created_ts,
                        entity_key_bin,
                        feature_name,
                    ),
                )
                conn.execute(
                    insert_sql,
                    (
                        entity_key_bin,
                        feature_name,
                        value_bin,
                        timestamp,
                        created_ts,
                    ),
                )
def online_write_batch(
    self,
    config: RepoConfig,
    table: FeatureView,
    data: List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]],
    progress: Optional[Callable[[int], Any]],
) -> None:
    """Upsert a batch of feature values using multi-row INSERT statements.

    All ``(entity_key, feature_name, value, event_ts, created_ts)`` rows are
    collected first and then written in slices of 5000 rows. A row whose
    ``(entity_key, feature_name)`` already exists has its value and
    timestamps overwritten.

    Args:
        config: Repo configuration; supplies the connection and the project
            name used to derive the table id.
        table: Feature view being written to.
        data: Entries of ``(entity_key, values, event_timestamp, created_ts)``
            where ``values`` maps feature name to its ``ValueProto``.
            Timestamps are passed through ``_to_naive_utc`` before storage;
            ``created_ts`` may be ``None`` and is then stored as-is.
        progress: Optional callback, invoked after each slice with the number
            of rows in that slice.
    """
    project = config.project
    with self._get_conn(config) as conn, conn.cursor() as cur:
        pending_rows = []
        for entity_key, feature_values, event_ts, created_ts in data:
            key_bytes = serialize_entity_key(entity_key)
            event_ts = _to_naive_utc(event_ts)
            if created_ts is not None:
                created_ts = _to_naive_utc(created_ts)

            pending_rows.extend(
                (key_bytes, name, proto.SerializeToString(), event_ts, created_ts)
                for name, proto in feature_values.items()
            )

        # Control the batch size so that progress can be reported per slice.
        batch_size = 5000
        total_rows = len(pending_rows)
        for start in range(0, total_rows, batch_size):
            chunk = pending_rows[start:start + batch_size]
            table_ident = sql.Identifier(_table_id(project, table))
            statement = sql.SQL(
                """
                INSERT INTO {}
                (entity_key, feature_name, value, event_ts, created_ts)
                VALUES %s
                ON CONFLICT (entity_key, feature_name) DO
                UPDATE SET
                    value = EXCLUDED.value,
                    event_ts = EXCLUDED.event_ts,
                    created_ts = EXCLUDED.created_ts;
                """,
            ).format(table_ident)
            execute_values(cur, statement, chunk, page_size=batch_size)
            if progress:
                progress(len(chunk))
def online_read(
    self,
    config: RepoConfig,
    table: FeatureView,
    entity_keys: List[EntityKeyProto],
    requested_features: Optional[List[str]] = None,
) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]:
    """Read the stored feature values for all given entity keys in one query.

    Args:
        config: Repo configuration; supplies the connection and the project
            name used to derive the table id.
        table: Feature view whose rows are queried.
        entity_keys: Keys to look up. Each is serialized and matched against
            the ``entity_key`` column.
        requested_features: Not used by this implementation; every stored
            feature for a key is returned.

    Returns:
        One tuple per entry of ``entity_keys``, in the same order. Each tuple
        is ``(event_ts, features)``, where ``features`` maps feature name to
        its parsed ``ValueProto`` and ``event_ts`` is the timestamp of the
        last row processed for that key, or ``(None, None)`` when the key has
        no rows.
    """
    result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = []
    project = config.project

    with self._get_conn(config) as conn, conn.cursor() as cur:
        # Sending every serialized key in a single statement keeps the number
        # of round trips to PostgreSQL down.
        keys = [serialize_entity_key(entity_key) for entity_key in entity_keys]

        query = sql.SQL(
            """
            SELECT entity_key, feature_name, value, event_ts
            FROM {}
            WHERE entity_key = ANY(%s);
            """
        ).format(sql.Identifier(_table_id(project, table)))
        cur.execute(query, (keys,))
        fetched = cur.fetchall()

        # The order of the returned rows is not known, so index them by
        # entity key; the results are then emitted in the order of ``keys``.
        rows_by_key = defaultdict(list)
        if fetched is not None:
            for record in fetched:
                rows_by_key[record[0].tobytes()].append(record[1:])

        for key in keys:
            feature_rows = rows_by_key.get(key)
            if feature_rows is None:
                result.append((None, None))
                continue

            features = {}
            for feature_name, value_bin, event_ts in feature_rows:
                proto = ValueProto()
                proto.ParseFromString(value_bin)
                features[feature_name] = proto
            # event_ts is the timestamp of the last row handled above.
            result.append((event_ts, features))

    return result
def _redis_key(project: str, entity_key: EntityKeyProto) -> bytes:
    """Build the key bytes for an entity: serialized entity key, then project.

    The project name is UTF-8 encoded and appended directly after the
    serialized entity key, with no separator.
    """
    serialized_key = serialize_entity_key(entity_key)
    project_bytes = project.encode("utf-8")
    return b"".join((serialized_key, project_bytes))