def feast_client(pytestconfig, ingestion_job_jar):
    """Create the Feast ``Client`` matching the ``env`` pytest option.

    ``env == "local"`` yields a client that uses the standalone Spark
    launcher and the Redis endpoint taken from the ``redis_url`` option.
    ``env == "gcloud"`` yields a client that uses the Dataproc launcher.
    For any other value the function falls through and returns ``None``.
    """
    option = pytestconfig.getoption

    # Parsed before the environment check, so ``redis_url`` has to be a
    # ``host:port`` string whichever environment is selected.
    redis_host, redis_port = option("redis_url").split(":")

    env = option("env")

    if env == "local":
        spark_home = os.getenv("SPARK_HOME") or os.path.dirname(pyspark.__file__)
        return Client(
            core_url=option("core_url"),
            serving_url=option("serving_url"),
            spark_launcher="standalone",
            spark_standalone_master="local",
            spark_home=spark_home,
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=redis_host,
            redis_port=redis_port,
        )

    if env == "gcloud":
        staging_location = os.path.join(option("staging_path"), "dataproc")
        return Client(
            core_url=option("core_url"),
            serving_url=option("serving_url"),
            spark_launcher="dataproc",
            dataproc_cluster_name=option("dataproc_cluster_name"),
            dataproc_project=option("dataproc_project"),
            dataproc_region=option("dataproc_region"),
            dataproc_staging_location=staging_location,
            spark_ingestion_jar=ingestion_job_jar,
        )
def test_restarting_failed_jobs(feature_table):
    """With retry enabled and no listed jobs, one stream job is started.

    ``list_jobs`` is mocked to return an empty list and the feature store
    lists a single feature table; the test then checks that exactly one
    stream-to-online ingestion is started for project ``default``.
    """
    store = FeastClient(
        job_service_pause_between_jobs=0,
        job_service_retry_failed_jobs=True,
        options={"whitelisted_projects": "default,ride"},
    )
    store.list_projects = Mock(return_value=["default"])
    store.list_feature_tables = Mock()

    jobs_client = Client(store)
    jobs_client.list_jobs = Mock()
    jobs_client.start_stream_to_online_ingestion = Mock()
    jobs_client.feature_store.list_feature_tables.return_value = [feature_table]
    jobs_client.list_jobs.return_value = []

    ensure_stream_ingestion_jobs(jobs_client, all_projects=True)

    jobs_client.list_jobs.assert_called_once_with(include_terminated=False)
    jobs_client.start_stream_to_online_ingestion.assert_called_once_with(
        feature_table, [], project="default"
    )
def __init__(
    self,
    name: str,
    predictor_host: str,
    feast_serving_url: str,
    entity_ids: List[str],
    feature_refs: List[str],
):
    """Store the model configuration and create the Feast serving client.

    Args:
        name (str): Name of the model; forwarded to the parent initializer.
        predictor_host (str): The host in which the predictor runs.
        feast_serving_url (str): The Feast serving URL, in the form
            ``host_name:port``.
        entity_ids (List[str]): The entity IDs for which to retrieve
            features from the Feast feature store.
        feature_refs (List[str]): The feature references for the features
            to be retrieved.
    """
    super().__init__(name)

    self.predictor_host = predictor_host
    self.client = Client(serving_url=feast_serving_url)
    self.entity_ids = entity_ids
    self.feature_refs = feature_refs

    # Log every setting once, in a fixed order, using lazy %-formatting.
    settings = (
        ("Model name = %s", name),
        ("Predictor host = %s", predictor_host),
        ("Feast serving URL = %s", feast_serving_url),
        ("Entity ids = %s", entity_ids),
        ("Feature refs = %s", feature_refs),
    )
    for template, value in settings:
        logging.info(template, value)

    self.timeout = 100
def ingest_and_verify(
    feast_client: Client, feature_table: FeatureTable, original: pd.DataFrame
):
    """Run an offline-to-online ingestion and check the online values.

    The ingestion window spans the minimum event timestamp in ``original``
    up to one second past the maximum. After the job reports
    ``COMPLETED``, the ``unique_drivers`` feature is fetched online for
    every ``s2id`` in ``original`` and compared with the source frame.
    """
    window_start = original.event_timestamp.min().to_pydatetime()
    window_end = original.event_timestamp.max().to_pydatetime() + timedelta(
        seconds=1
    )
    job = feast_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )
    assert job.get_feature_table() == feature_table.name

    def job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(job_completed, 180)

    feature_ref = f"{feature_table.name}:unique_drivers"
    entity_rows = [{"s2id": s2_id} for s2_id in original["s2id"].tolist()]
    online = feast_client.get_online_features(
        [feature_ref], entity_rows=entity_rows
    ).to_dict()
    ingested = pd.DataFrame.from_dict(online)

    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": feature_ref}
    )
    pd.testing.assert_frame_equal(ingested[["s2id", feature_ref]], expected)
def init(self, conf: ConfigTree) -> None:
    """Read the extractor settings from ``conf`` and create the Feast client.

    ``conf`` is first layered over ``FeastExtractor.DEFAULT_CONFIG`` so
    that keys missing from the caller's config fall back to the defaults.
    """
    settings = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)

    self._feast_service = settings.get_string(
        FeastExtractor.FEAST_SERVICE_CONFIG_KEY
    )
    self._describe_feature_tables = settings.get_bool(
        FeastExtractor.DESCRIBE_FEATURE_TABLES
    )

    endpoint = settings.get_string(FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY)
    self._client = Client(core_url=endpoint)

    # No extraction iterator exists until one is created elsewhere.
    self._extract_iter: Union[None, Iterator] = None
def feast_client():
    """Yield a ``FeastClient`` whose project and table listings are mocked.

    ``list_projects`` reports ``default``, ``ride`` and ``invalid_project``
    while only ``default`` and ``ride`` are whitelisted in the options.
    """
    listed_projects = ["default", "ride", "invalid_project"]

    mocked_client = FeastClient(
        job_service_pause_between_jobs=0,
        options={"whitelisted_projects": "default,ride"},
    )
    mocked_client.list_projects = Mock(return_value=listed_projects)
    mocked_client.list_feature_tables = Mock()

    yield mocked_client
def ensure_stream_ingestion_jobs(client: feast.Client, all_projects: bool):
    """Reconcile running stream ingestion jobs with the expected set.

    The expected jobs are derived from the feature tables of the selected
    projects; the existing jobs are the non-terminated
    ``StreamIngestionJob`` instances reported by ``client.list_jobs``.
    Jobs are matched by job hash, then:

    - every existing job whose hash is not expected is cancelled
    - every expected job whose hash is not running is started

    Args:
        client (feast.Client): Client used to list projects, feature
            tables and jobs, and to start new ingestion jobs.
        all_projects (bool): If true, runs the check for all projects.
            Otherwise only checks the client's current project.
    """
    if all_projects:
        projects = client.list_projects()
    else:
        projects = [client.project]

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects
    )
    expected_job_hashes = set(expected_job_hash_to_table_refs)

    jobs_by_hash: Dict[str, StreamIngestionJob] = {
        job.get_hash(): job
        for job in client.list_jobs(include_terminated=False)
        if isinstance(job, StreamIngestionJob)
    }
    existing_job_hashes = set(jobs_by_hash)

    logging.debug(
        f"existing_job_hashes = {sorted(existing_job_hashes)} expected_job_hashes = {sorted(expected_job_hashes)}"
    )

    # Running but no longer expected: cancel.
    for job_hash in existing_job_hashes - expected_job_hashes:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            # A failed cancellation is logged and must not stop the pass.
            logging.warning(f"Job canceling failed with exception {exc}")

    # Expected but not running: start.
    for job_hash in expected_job_hashes - existing_job_hashes:
        # Any job that we wish to start should be among expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.get_feature_table(name=table_name, project=project)
        client.start_stream_to_online_ingestion(feature_table, [], project=project)
def test_schedule_batch_ingestion_jobs(
    pytestconfig, feast_client: Client, feast_spark_client: SparkClient
):
    """Schedule, reschedule and unschedule a batch ingestion job.

    After each scheduling call the ``scheduledsparkapplications`` custom
    object named ``feast-<md5 of "<project>-<table name>">`` is read back
    from Kubernetes and its ``spec.schedule`` is compared with the cron
    expression that was just submitted.
    """
    first_cron = "0 0 * * *"
    second_cron = "1 0 * * *"

    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    # Unique table name per run; dashes of the UUID become underscores.
    table_name = f"schedule_{str(uuid.uuid4())}".replace("-", "_")
    feature_table = FeatureTable(
        name=table_name,
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, first_cron
    )

    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        hashed = f"{feast_client.project}-{feature_table.name}".encode()
        job_hash = hashlib.md5(hashed).hexdigest()
        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=f"feast-{job_hash}",
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == first_cron

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, second_cron
    )
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == second_cron

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
def _get_expected_job_hash_to_table_refs(
    client: feast.Client, projects: List[str]
) -> Dict[str, Tuple[str, str]]:
    """Map the hash of every required stream ingestion job to its table.

    Every feature table of every given project is inspected; tables
    without a stream source are ignored. For the remaining tables the job
    hash is taken from the stream-to-online ingestion parameters.

    Args:
        client (feast.Client): Client used to list feature tables and to
            build the ingestion parameters.
        projects (List[str]): Names of the projects to inspect.

    Returns:
        Dict[str, Tuple[str, str]]: Map of job_hash -> (project, table_name)
        for expected stream ingestion jobs
    """
    expected: Dict[str, Tuple[str, str]] = {}

    for project in projects:
        for table in client.list_feature_tables(project):
            if table.stream_source is None:
                continue
            ingestion_params = get_stream_to_online_ingestion_params(
                client, project, table, []
            )
            expected[ingestion_params.get_job_hash()] = (project, table.name)

    return expected
def start_job(feast_client: Client, feature_table: FeatureTable, pytestconfig):
    """Start a streaming ingestion job unless a scheduled one is in use.

    Returns ``None`` when the ``scheduled_streaming_job`` option is set;
    otherwise starts the job, waits until it reports ``IN_PROGRESS`` and
    returns it.
    """
    if pytestconfig.getoption("scheduled_streaming_job"):
        return None

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def job_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    wait_retry_backoff(job_running, 120)
    return job
def test_feature_table_whitelist():
    """Only ``project:table`` pairs listed in the whitelist file are allowed."""
    whitelist = b"project1:table1\nproject1:table2"

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(whitelist)
        # Rewind after writing, as the original did, before the file is
        # handed over by name.
        tmp.seek(0)

        servicer = JobServiceServicer(
            JobClient(Client(whitelisted_feature_tables_path=tmp.name))
        )

        assert not servicer.is_feature_table_whitelisted("project2", "table1")
        assert servicer.is_feature_table_whitelisted("project1", "table1")
def feast_client(
    pytestconfig,
    ingestion_job_jar,
    redis_server: RedisExecutor,
    feast_core: Tuple[str, int],
    feast_serving: Tuple[str, int],
    local_staging_path,
):
    """Create the Feast ``Client`` matching the ``env`` pytest option.

    ``local`` uses the standalone Spark launcher and the ``redis_server``
    fixture; ``gcloud`` uses the Dataproc launcher and the Redis endpoint
    from the ``redis_url`` option. For any other value the function falls
    through and returns ``None``.
    """
    env = pytestconfig.getoption("env")

    def endpoint(host_port):
        return f"{host_port[0]}:{host_port[1]}"

    def historical_output():
        return os.path.join(local_staging_path, "historical_output")

    if env == "local":
        return Client(
            core_url=endpoint(feast_core),
            serving_url=endpoint(feast_serving),
            spark_launcher="standalone",
            spark_standalone_master="local",
            spark_home=os.getenv("SPARK_HOME")
            or os.path.dirname(pyspark.__file__),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=redis_server.host,
            redis_port=redis_server.port,
            spark_staging_location=os.path.join(local_staging_path, "spark"),
            historical_feature_output_location=historical_output(),
        )

    if env == "gcloud":
        redis_parts = pytestconfig.getoption("redis_url").split(":")
        return Client(
            core_url=endpoint(feast_core),
            serving_url=endpoint(feast_serving),
            spark_launcher="dataproc",
            dataproc_cluster_name=pytestconfig.getoption("dataproc_cluster_name"),
            dataproc_project=pytestconfig.getoption("dataproc_project"),
            dataproc_region=pytestconfig.getoption("dataproc_region"),
            spark_staging_location=os.path.join(local_staging_path, "dataproc"),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=redis_parts[0],
            redis_port=redis_parts[1],
            historical_feature_output_location=historical_output(),
        )
def test_offline_ingestion(feast_client: Client, staging_path: str):
    """Ingest generated data offline, load it online and compare the values.

    The data is written to the file batch source, an offline-to-online
    ingestion job covering today up to tomorrow is started, and once the
    job leaves ``IN_PROGRESS`` it must be ``COMPLETED``. The
    ``unique_drivers`` feature read online must then equal the generated
    data.
    """
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )

    def poll_status():
        # Read the status once per attempt so that the reported value and
        # the "finished" flag always describe the same observation.
        current = job.get_status()
        return current, current != SparkJobStatus.IN_PROGRESS

    status = wait_retry_backoff(poll_status, 300)
    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()
    ingested = pd.DataFrame.from_dict(features)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers:unique_drivers"}
        ),
    )
def test_telemetry_off_v09(mocker):
    """With telemetry disabled, applying an entity records no telemetry row.

    The environment is modified only for the duration of the client calls
    and is always restored afterwards, even when one of those calls
    raises, so a failure here cannot leak variables into later tests.
    """
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())

    try:
        os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
        os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
        os.environ["FEAST_TELEMETRY"] = "False"

        test_client = Client(serving_url=None, core_url=None, telemetry=False)
        test_client.set_project("project1")
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        # Stub the private apply so that no Feast Core call is attempted.
        mocker.patch.object(
            test_client,
            "_apply_entity",
            return_value=None,
        )

        test_client.apply(entity)
    finally:
        os.environ.clear()
        os.environ.update(old_environ)

    # Wait before querying so that a row, had one been sent, would be visible.
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
def test_offline_ingestion_from_bq_view(
    pytestconfig, bq_dataset, feast_client: Client, feast_spark_client: SparkClient
):
    """Ingest from a BigQuery view that selects from a freshly loaded table.

    Generated data is loaded into a timestamp-suffixed source table, a
    view selecting everything from that table is created, and a feature
    table backed by the view is registered before ingestion is verified.
    """
    original = generate_data()

    bq_project = pytestconfig.getoption("bq_project")
    bq_client = bigquery.Client(project=bq_project)

    # ``%S`` (seconds) is the documented directive; lowercase ``%s`` is a
    # platform-specific extension and is not portable.
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%S}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%S}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table, original)
def test_telemetry_on_v09(mocker):
    """With telemetry enabled, applying an entity records a telemetry row.

    The environment is modified only for the duration of the client calls
    and is always restored afterwards, even when one of those calls
    raises, so a failure here cannot leak variables into later tests.
    """
    # Setup environment
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())

    try:
        os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
        os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id

        test_client = Client(serving_url=None, core_url=None, telemetry=True)
        test_client.set_project("project1")
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        # Stub the private apply so that no Feast Core call is attempted.
        mocker.patch.object(
            test_client,
            "_apply_entity",
            return_value=None,
        )

        test_client.apply(entity)
    finally:
        os.environ.clear()
        os.environ.update(old_environ)

    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    """Register a ``drivers`` table, ingest generated data and verify it."""
    s2id_entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    drivers_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    for resource in (s2id_entity, drivers_table):
        feast_client.apply(resource)

    original = generate_data()
    # write to batch (offline) storage
    feast_client.ingest(drivers_table, original)

    ingest_and_verify(feast_client, feast_spark_client, drivers_table, original)
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    """Jobs for long entity and table names can be listed and served.

    After the ingestion job completes, it must appear in the job list
    filtered by project and table name, and the online values must equal
    the ingested sample.
    """
    entity = Entity(
        name="long_entity_name" * 10,
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    # The generated key column is renamed to match the long entity name.
    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    window_start = data_sample.event_timestamp.min().to_pydatetime()
    window_end = data_sample.event_timestamp.max().to_pydatetime() + timedelta(
        seconds=1
    )
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )

    def job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(job_completed, 180)

    listed_jobs = feast_spark_client.list_jobs(
        include_terminated=True,
        project=feast_client.project,
        table_name=feature_table.name,
    )
    all_job_ids = [listed.get_id() for listed in listed_jobs]
    assert job.get_id() in all_job_ids

    feature_ref = f"{feature_table.name}:unique_drivers"
    entity_rows = [{entity.name: key} for key in data_sample[entity.name].tolist()]
    features = feast_client.get_online_features(
        [feature_ref], entity_rows=entity_rows
    ).to_dict()
    ingested = pd.DataFrame.from_dict(features)

    expected = data_sample[[entity.name, "unique_drivers"]].rename(
        columns={"unique_drivers": feature_ref}
    )
    pd.testing.assert_frame_equal(ingested[[entity.name, feature_ref]], expected)
def get_conn(self):
    """Return the Feast client, creating and caching it on first use.

    When ``self.client`` is already truthy it is returned unchanged.
    Otherwise the connection identified by ``self.conn_id`` is looked up
    and its JSON extras supply ``core_url`` (required), ``serving_url``
    and ``project`` (both optional) for a new ``Client``.
    """
    if not self.client:
        self.connection = self.get_connection(self.conn_id)
        self.extras = self.connection.extra_dejson
        extras = self.extras
        self.client = Client(
            core_url=extras["core_url"],
            serving_url=extras.get("serving_url"),
            project=extras.get("project"),
        )
    return self.client
def client_with_local_spark(tmpdir):
    """Return a ``Client`` that runs Spark locally and stages under ``tmpdir``.

    Staging and historical-retrieval output locations are ``file://`` URLs
    for sub-directories of ``tmpdir``; retrieval output format is parquet.
    """
    import pyspark

    staging_dir = os.path.join(tmpdir, "staging")
    output_dir = os.path.join(tmpdir, "historical_feature_retrieval_output")

    return Client(
        core_url=f"localhost:{free_port}",
        spark_launcher="standalone",
        spark_standalone_master="local",
        spark_home=os.path.dirname(pyspark.__file__),
        spark_staging_location=f"file://{staging_dir}",
        historical_feature_output_location=f"file://{output_dir}",
        historical_feature_output_format="parquet",
    )
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    """Historical retrieval joins ``daily_transactions`` onto customer rows.

    The expected frame holds the ingested values followed by one ``None``
    per transaction row; both frames are sorted by ``user_id`` and
    ``event_timestamp`` before they are compared.
    """
    sort_keys = ["user_id", "event_timestamp"]

    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    # Two days expressed in seconds.
    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    job = feast_client.get_historical_features(
        ["transactions:daily_transactions"], customers_df
    )
    joined_df = read_parquet(job.get_output_file_uri())

    expected_values = transactions_df.daily_transactions.tolist()
    expected_values = expected_values + [None] * transactions_df.shape[0]
    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": expected_values,
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=sort_keys).reset_index(drop=True),
        expected_joined_df.sort_values(by=sort_keys).reset_index(drop=True),
    )
def test_list_jobs_long_table_name(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    """A completed job for a very long table name is returned by ``list_jobs``."""
    long_table_name = "just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name"

    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name=long_table_name,
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    window_start = data_sample.event_timestamp.min().to_pydatetime()
    window_end = data_sample.event_timestamp.max().to_pydatetime() + timedelta(
        seconds=1
    )
    job = feast_client.start_offline_to_online_ingestion(
        feature_table, window_start, window_end
    )

    def job_completed():
        return None, job.get_status() == SparkJobStatus.COMPLETED

    wait_retry_backoff(job_completed, 180)

    listed_jobs = feast_client.list_jobs(
        include_terminated=True, table_name=feature_table.name
    )
    all_job_ids = [listed.get_id() for listed in listed_jobs]
    assert job.get_id() in all_job_ids
def feast_client(
    pytestconfig,
    ingestion_job_jar,
    redis_server: RedisExecutor,
    feast_core: Tuple[str, int],
    feast_serving: Tuple[str, int],
    local_staging_path,
    feast_jobservice: Optional[Tuple[str, int]],
    enable_auth,
):
    """Create the Feast ``Client`` matching the ``env`` pytest option.

    Supported values are ``local``, ``gcloud``, ``aws`` and ``k8s``; any
    other value raises ``KeyError``. When ``feast_jobservice`` is given,
    its ``host:port`` is passed as ``job_service_url`` to the ``local``
    and ``gcloud`` clients.

    NOTE(review): only the ``gcloud`` client has ``feast_project``
    applied, and the ``aws``/``k8s`` clients do not receive the job
    service URL -- confirm whether that asymmetry is intended.
    """
    job_service_env = {}
    if feast_jobservice is not None:
        job_service_env["job_service_url"] = (
            f"{feast_jobservice[0]}:{feast_jobservice[1]}"
        )

    def shared_kwargs():
        # Arguments that every environment passes identically.
        return dict(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_ingestion_jar=ingestion_job_jar,
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"
            ),
        )

    def redis_from_option():
        # Remote environments take Redis from the ``redis_url`` option.
        host_port = pytestconfig.getoption("redis_url").split(":")
        return dict(redis_host=host_port[0], redis_port=host_port[1])

    env = pytestconfig.getoption("env")

    if env == "local":
        import pyspark

        return Client(
            spark_launcher="standalone",
            spark_standalone_master="local",
            spark_home=os.getenv("SPARK_HOME")
            or os.path.dirname(pyspark.__file__),
            redis_host=redis_server.host,
            redis_port=redis_server.port,
            spark_staging_location=os.path.join(local_staging_path, "spark"),
            ingestion_drop_invalid_rows=True,
            **shared_kwargs(),
            **job_service_env,
        )

    if env == "gcloud":
        c = Client(
            spark_launcher="dataproc",
            dataproc_cluster_name=pytestconfig.getoption("dataproc_cluster_name"),
            dataproc_project=pytestconfig.getoption("dataproc_project"),
            dataproc_region=pytestconfig.getoption("dataproc_region"),
            spark_staging_location=os.path.join(local_staging_path, "dataproc"),
            ingestion_drop_invalid_rows=True,
            grpc_connection_timeout=30,
            **shared_kwargs(),
            **redis_from_option(),
            **job_service_env,
        )
        c.set_project(pytestconfig.getoption("feast_project"))
        return c

    if env == "aws":
        return Client(
            spark_launcher="emr",
            emr_cluster_id=pytestconfig.getoption("emr_cluster_id"),
            emr_region=pytestconfig.getoption("emr_region"),
            spark_staging_location=os.path.join(local_staging_path, "emr"),
            emr_log_location=os.path.join(local_staging_path, "emr_logs"),
            ingestion_drop_invalid_rows=True,
            **shared_kwargs(),
            **redis_from_option(),
        )

    if env == "k8s":
        return Client(
            spark_launcher="k8s",
            spark_staging_location=os.path.join(local_staging_path, "k8s"),
            **shared_kwargs(),
            **redis_from_option(),
        )

    raise KeyError(f"Unknown environment {pytestconfig.getoption('env')}")
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    """Historical retrieval returns the joined features and a sane start time.

    The job start time must lie within one hour after submission, and the
    parquet output must equal the expected join. A second retrieval
    through ``tfrecord_feast_client`` only has to finish as ``COMPLETED``.
    """
    sort_keys = ["user_id", "event_timestamp"]

    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    # Two days expressed in seconds.
    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )
    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    latest_accepted_start = job_submission_time + timedelta(hours=1)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= latest_accepted_start

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir,
        azure_account_name=account_name,
        azure_account_key=account_key,
    )

    expected_values = transactions_df.daily_transactions.tolist()
    expected_values = expected_values + [None] * transactions_df.shape[0]
    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": expected_values,
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=sort_keys).reset_index(drop=True),
        expected_joined_df.sort_values(by=sort_keys).reset_index(drop=True),
    )

    job = tfrecord_feast_client.get_historical_features(feature_refs, customers_df)
    # The returned URI is not used here; only the final status is checked.
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
from feast import Client # type: ignore from feast.data_format import ParquetFormat from feast.data_source import FileSource # type: ignore from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable # type: ignore from feast.value_type import ValueType if __name__ == "__main__": if feast.__version__ > FEAST_MIN_VERSION: raise Exception( f"this code does not work with feast > {FEAST_MIN_VERSION}. Found {feast.__version__}" ) test_client = Client(core_url="testfeast:6565") # create dummy entity since Feast demands it entity_1 = Entity( name="dummy_entity_1", description="Dummy entity 1", value_type=ValueType.STRING, labels={"key": "val"}, ) # create dummy entity since Feast demands it entity_2 = Entity( name="dummy_entity_2", description="Dummy entity 2", value_type=ValueType.INT32, labels={"key": "val"},
def test_streaming_ingestion(
    feast_client: Client, local_staging_path: str, kafka_server
):
    """Avro records sent to Kafka become available as online features.

    A streaming job is started for a table with a Kafka stream source on
    a fresh topic. Once the job is ``IN_PROGRESS`` and a consumer exists
    on the topic, the generated records are sent and the test polls until
    no requested feature value is missing. The job is cancelled before
    the final comparison.
    """
    feature_ref = "drivers_stream:unique_drivers"
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        event_timestamp_column="event_timestamp",
        created_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=os.path.join(local_staging_path, "batch-storage"),
    )
    stream_source = KafkaSource(
        "event_timestamp",
        "event_timestamp",
        kafka_broker,
        AvroFormat(avro_schema()),
        topic=topic_name,
    )
    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
        stream_source=stream_source,
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    def job_running():
        return None, job.get_status() == SparkJobStatus.IN_PROGRESS

    def consumer_attached():
        return None, check_consumer_exist(kafka_broker, topic_name)

    wait_retry_backoff(job_running, 60)
    wait_retry_backoff(consumer_attached, 60)

    try:
        original = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

        for record in original.to_dict("records"):
            # Attach UTC to the timestamp before the record is sent.
            stamp = record["event_timestamp"].to_pydatetime()
            record["event_timestamp"] = stamp.replace(tzinfo=pytz.utc)
            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            entity_rows = [{"s2id": s2_id} for s2_id in original["s2id"].tolist()]
            features = feast_client.get_online_features(
                [feature_ref], entity_rows=entity_rows
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            # Finished only when no requested feature value is missing.
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    expected = original[["s2id", "unique_drivers"]].rename(
        columns={"unique_drivers": "drivers_stream:unique_drivers"}
    )
    pd.testing.assert_frame_equal(ingested[["s2id", feature_ref]], expected)
def stop_job(job, feast_client: Client, feature_table: FeatureTable):
    """Cancel ``job`` when there is one; otherwise delete the feature table."""
    if not job:
        # No job handle was given, so the feature table is deleted instead.
        feast_client.delete_feature_table(feature_table.name)
        return
    job.cancel()
def cli(core_url, output_path):
    """Dump the feature tables registered in Feast Core as a list of dicts.

    For every feature table (sorted by name) the output holds the table
    name plus its entities and features, each sorted by name and each
    carrying the same batch/stream source information.

    Args:
        core_url: Feast Core URL passed to ``Client``.
        output_path: File to write the result to as JSON. When ``None``
            the parsed list is printed instead.
    """
    client = Client(core_url=core_url)

    # sort tables by name for consistent outputs
    tables = sorted(client.list_feature_tables(), key=lambda x: x.name)

    parsed_tables = []

    for table in tables:
        batch_source = None
        stream_source = None

        # platform and name for constructing URN later on
        batch_source_platform = "unknown"
        stream_source_platform = "unknown"
        batch_source_name = "unknown"
        stream_source_name = "unknown"

        if isinstance(table.batch_source, BigQuerySource):
            batch_source = "BigQuerySource"
            batch_source_platform = "bigquery"
            batch_source_name = table.batch_source.bigquery_options.table_ref

        if isinstance(table.batch_source, FileSource):
            batch_source = "FileSource"
            batch_source_platform = "file"

            # replace slashes because the react frontend can't parse them correctly
            batch_source_name = table.batch_source.file_options.file_url.replace(
                "/", "."
            )

            # replace redundant file prefix
            if batch_source_name.startswith("file:.."):
                batch_source_name = batch_source_name[7:]

        if isinstance(table.stream_source, KafkaSource):
            stream_source = "KafkaSource"
            stream_source_platform = "kafka"
            stream_source_name = table.stream_source.kafka_options.topic

        if isinstance(table.stream_source, KinesisSource):
            stream_source = "KinesisSource"
            stream_source_platform = "kinesis"
            stream_source_name = f"{table.stream_source.kinesis_options.region}-{table.stream_source.kinesis_options.stream_name}"

        # currently unused in MCE outputs, but useful for debugging
        # (the table is serialized once and both configs are read from it)
        table_spec = table.to_dict()["spec"]
        stream_source_config = table_spec.get("streamSource")
        batch_source_config = table_spec["batchSource"]

        raw_entities = [
            client.get_entity(entity_name) for entity_name in table.entities
        ]
        raw_entities = sorted(raw_entities, key=lambda x: x.name)

        source_info = {
            "batch_source": batch_source,
            "stream_source": stream_source,
            "batch_source_config": batch_source_config,
            "stream_source_config": stream_source_config,
            "batch_source_platform": batch_source_platform,
            "stream_source_platform": stream_source_platform,
            "batch_source_name": batch_source_name,
            "stream_source_name": stream_source_name,
        }

        # sort entities by name for consistent outputs
        entities = sorted(
            [
                {
                    "name": x.name,
                    "type": x.value_type.name,
                    "description": x.description,
                    **source_info,
                }
                for x in raw_entities
            ],
            key=lambda x: x["name"],
        )

        # sort features by name for consistent outputs
        features = sorted(
            [
                {"name": x.name, "type": x.dtype.name, **source_info}
                for x in table.features
            ],
            key=lambda x: x["name"],
        )

        parsed_tables.append(
            {
                "name": table.name,
                "entities": entities,
                "features": features,
            }
        )

    if output_path is not None:
        with open(output_path, "w") as f:
            json.dump(parsed_tables, f)
    else:
        print(parsed_tables)
Feature(name="int64_feature", dtype=ValueType.INT64), Feature(name="int32_feature", dtype=ValueType.INT32), Feature(name="string_feature", dtype=ValueType.STRING), Feature(name="bytes_feature", dtype=ValueType.BYTES), Feature(name="bool_feature", dtype=ValueType.BOOL), Feature(name="double_feature", dtype=ValueType.DOUBLE), Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST), Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST), Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST), Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST), Feature(name="string_list_feature", dtype=ValueType.STRING_LIST), Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST), ], ) client = Client(core_url=feast_core_url, serving_url=feast_online_serving_url) # Register feature set client.apply(all_types_fs_expected) df.info() df.describe() df.head() # Ingest tdata client.ingest(all_types_fs_expected, df) # Wait for data to be available def try_get_features(): online_request_entity = [{"user_id": 1001}]
def test_historical_features(feast_client: Client, local_staging_path: str):
    """Historical retrieval honours ``max_age`` for file-backed features.

    Transactions are stamped two days before the retrieval date and the
    table's ``max_age`` is two days. Each customer is requested at the
    retrieval date and one day later; the expected frame holds the
    ingested value for the first request and ``None`` for the second.
    """
    sort_keys = ["user_id", "event_timestamp"]

    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    # Two days expressed in seconds.
    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    # Midnight (UTC clock) of the current day, as a naive datetime.
    retrieval_date = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(days=1)
    event_date = retrieval_date - timedelta(days=2)
    creation_date = retrieval_date - timedelta(days=1)

    customers = [1001, 1002, 1003, 1004, 1005]
    customer_count = len(customers)
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date] * customer_count,
            "created_timestamp": [creation_date] * customer_count,
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )
    feast_client.ingest(transactions_feature_table, transactions_df)

    # Every customer is requested twice: at the retrieval date and one day later.
    request_timestamps = (
        [retrieval_date] * customer_count
        + [retrieval_outside_max_age_date] * customer_count
    )
    requested_users = customers + customers

    customer_df = pd.DataFrame(
        {
            "event_timestamp": list(request_timestamps),
            "user_id": list(requested_users),
        }
    )

    job = feast_client.get_historical_features(
        ["transactions:daily_transactions"], customer_df
    )
    joined_df = read_parquet(job.get_output_file_uri())

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": list(request_timestamps),
            "user_id": list(requested_users),
            "transactions__daily_transactions": daily_transactions
            + [None] * customer_count,
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=sort_keys).reset_index(drop=True),
        expected_joined_df.sort_values(by=sort_keys).reset_index(drop=True),
    )