def ingest_and_verify(feast_client: Client, feature_table: FeatureTable, original: pd.DataFrame):
    """Run an offline-to-online ingestion job and verify the online store.

    Starts an ingestion job covering the full timestamp range of ``original``,
    waits (up to 180s) for it to complete, then reads the features back from
    the online store and asserts they match the source dataframe exactly.

    Args:
        feast_client: Feast client used to launch the job and query features.
        feature_table: The feature table being ingested; expected to expose a
            ``unique_drivers`` feature keyed by ``s2id``.
        original: Source dataframe with ``event_timestamp``, ``s2id`` and
            ``unique_drivers`` columns.
    """
    start = original.event_timestamp.min().to_pydatetime()
    # End bound is exclusive in practice, so pad by one second to cover the max row.
    end = original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1)

    job = feast_client.start_offline_to_online_ingestion(feature_table, start, end)
    assert job.get_feature_table() == feature_table.name

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    feature_ref = f"{feature_table.name}:unique_drivers"
    response = feast_client.get_online_features(
        [feature_ref],
        entity_rows=[{"s2id": key} for key in original["s2id"].tolist()],
    ).to_dict()

    served = pd.DataFrame.from_dict(response)
    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": feature_ref}
    )
    pd.testing.assert_frame_equal(served[["s2id", feature_ref]], expected)
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    """Verify job listing works when entity and table names are very long.

    Registers an entity/table pair with deliberately oversized names, runs an
    ingestion job, and checks that (a) the job appears in ``list_jobs`` when
    filtered by project and table name, and (b) the ingested features can be
    read back from the online store.
    """
    entity = Entity(
        name="long_entity_name" * 10,
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    # Rename the generated key column so it matches the long entity name.
    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        # Pad by one second so the max-timestamp row falls inside the range.
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    listed = feast_spark_client.list_jobs(
        include_terminated=True,
        project=feast_client.project,
        table_name=feature_table.name,
    )
    assert job.get_id() in [j.get_id() for j in listed]

    feature_ref = f"{feature_table.name}:unique_drivers"
    response = feast_client.get_online_features(
        [feature_ref],
        entity_rows=[{entity.name: key} for key in data_sample[entity.name].tolist()],
    ).to_dict()

    served = pd.DataFrame.from_dict(response)
    expected = data_sample[[entity.name, "unique_drivers"]].rename(
        columns={"unique_drivers": feature_ref}
    )
    pd.testing.assert_frame_equal(served[[entity.name, feature_ref]], expected)
def test_offline_ingestion(feast_client: Client, staging_path: str):
    """End-to-end offline ingestion test.

    Registers an entity and a file-backed feature table, ingests generated
    driver data into batch storage, runs the offline-to-online ingestion job
    for today's window, and asserts the online store serves the original rows.
    """
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    # Write to batch (offline) storage first; the job copies it online.
    feast_client.ingest(feature_table, original)

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )
    # Poll until the job leaves IN_PROGRESS (up to 300s), then check it succeeded.
    final_status = wait_retry_backoff(
        lambda: (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300,
    )
    assert final_status == SparkJobStatus.COMPLETED

    response = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": key} for key in original["s2id"].tolist()],
    ).to_dict()

    served = pd.DataFrame.from_dict(response)
    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": "drivers:unique_drivers"}
    )
    pd.testing.assert_frame_equal(
        served[["s2id", "drivers:unique_drivers"]], expected
    )
class DriverTransformer(kfserving.KFModel):
    """KFServing transformer for the driver ranking task.

    Enriches incoming predict requests with features fetched from a Feast
    online store before forwarding them to the predictor, and passes the
    predictor's rankings through unchanged on the way back.
    """

    def __init__(self, name: str, predictor_host: str, feast_serving_url: str,
                 entity_ids: List[str], feature_refs: List[str]):
        """Set up the transformer and its Feast client.

        Args:
            name (str): Name of the model.
            predictor_host (str): The host in which the predictor runs.
            feast_serving_url (str): The Feast serving URL, in the form of
                <host_name:port>
            entity_ids (List[str]): The entity IDs for which to retrieve
                features from the Feast feature store
            feature_refs (List[str]): The feature references for the features
                to be retrieved
        """
        super().__init__(name)
        self.predictor_host = predictor_host
        self.client = Client(serving_url=feast_serving_url)
        self.entity_ids = entity_ids
        self.feature_refs = feature_refs
        logging.info("Model name = %s", name)
        logging.info("Predictor host = %s", predictor_host)
        logging.info("Feast serving URL = %s", feast_serving_url)
        logging.info("Entity ids = %s", entity_ids)
        logging.info("Feature refs = %s", feature_refs)
        self.timeout = 100

    def buildEntityRow(self, instance) -> Dict:
        """Map one request instance onto entity-id keys.

        Args:
            instance (list): entity id attributes to identify a unique entity

        Returns:
            Dict: Returns the entity id attributes as an entity row
        """
        return {self.entity_ids[idx]: value for idx, value in enumerate(instance)}

    def buildPredictRequest(self, inputs, features) -> Dict:
        """Assemble the predictor request: features first, then entity ids.

        Args:
            inputs (Dict): entity ids from KFServing http request
            features (Dict): entity features extracted from the feature store

        Returns:
            Dict: Returns the entity ids with features
        """
        instances = inputs['instances']
        request_data = []
        for row_idx, instance in enumerate(instances):
            # Feature values for this row, in feature_refs order.
            row = [features[ref][row_idx] for ref in self.feature_refs]
            # Append the raw entity-id attributes after the features.
            for col_idx in range(len(self.entity_ids)):
                row.append(instance[col_idx])
            request_data.append(row)
        return {'instances': request_data}

    def preprocess(self, inputs: Dict) -> Dict:
        """Enrich the incoming request with online features from Feast.

        Args:
            inputs (Dict): KFServing http request

        Returns:
            Dict: Returns the request input after ingesting online features
        """
        entity_rows = [self.buildEntityRow(instance) for instance in inputs['instances']]
        features = self.client.get_online_features(
            feature_refs=self.feature_refs, entity_rows=entity_rows
        ).to_dict()
        outputs = self.buildPredictRequest(inputs, features)
        logging.info("The input for model predict is %s", outputs)
        return outputs

    def postprocess(self, inputs: List) -> List:
        """Pass the predictor's raw rankings through unchanged.

        Args:
            inputs (List): The list of the inputs

        Returns:
            List: If a post process functionality is specified, it could
                convert raw rankings into a different list.
        """
        logging.info("The output from model predict is %s", inputs)
        return inputs