def test_list_entities_and_features(client):
    """Verify list_features_by_ref filtering by project, entities and labels.

    Registers a customer and a driver feature set (one labeled feature each)
    and checks that the three filter combinations return exactly the expected
    feature references.
    """
    customer_entity = Entity("customer_id", ValueType.INT64)
    driver_entity = Entity("driver_id", ValueType.INT64)

    customer_feature_rating = Feature(
        name="rating", dtype=ValueType.FLOAT, labels={"key1": "val1"}
    )
    customer_feature_cost = Feature(name="cost", dtype=ValueType.FLOAT)
    driver_feature_rating = Feature(name="rating", dtype=ValueType.FLOAT)
    driver_feature_cost = Feature(
        name="cost", dtype=ValueType.FLOAT, labels={"key1": "val1"}
    )

    # Expected results, keyed by "<feature_set>:<feature>" reference.
    # Dict literals instead of dict([...]) (flake8-comprehensions C406).
    filter_by_project_entity_labels_expected = {
        "customer:rating": customer_feature_rating,
    }
    filter_by_project_entity_expected = {
        "driver:cost": driver_feature_cost,
        "driver:rating": driver_feature_rating,
    }
    filter_by_project_labels_expected = {
        "customer:rating": customer_feature_rating,
        "driver:cost": driver_feature_cost,
    }

    customer_fs = FeatureSet(
        "customer",
        features=[customer_feature_rating, customer_feature_cost],
        entities=[customer_entity],
        max_age=Duration(seconds=100),
    )
    driver_fs = FeatureSet(
        "driver",
        features=[driver_feature_rating, driver_feature_cost],
        entities=[driver_entity],
        max_age=Duration(seconds=100),
    )

    client.set_project(PROJECT_NAME)
    client.apply(customer_fs)
    client.apply(driver_fs)

    # Test for listing of features
    # Case 1: Filter by: project, entities and labels
    filter_by_project_entity_labels_actual = client.list_features_by_ref(
        project=PROJECT_NAME, entities=["customer_id"], labels={"key1": "val1"}
    )

    # Case 2: Filter by: project, entities
    filter_by_project_entity_actual = client.list_features_by_ref(
        project=PROJECT_NAME, entities=["driver_id"]
    )

    # Case 3: Filter by: project, labels
    filter_by_project_labels_actual = client.list_features_by_ref(
        project=PROJECT_NAME, labels={"key1": "val1"}
    )

    # NOTE(review): set() over a dict compares keys only; the Feature values
    # themselves are never compared here.
    assert set(filter_by_project_entity_labels_expected) == set(
        filter_by_project_entity_labels_actual
    )
    assert set(filter_by_project_entity_expected) == set(
        filter_by_project_entity_actual
    )
    assert set(filter_by_project_labels_expected) == set(
        filter_by_project_labels_actual
    )
def test_apply_feature_set_success(self, client):
    """Register two feature sets with Core and list them back in order."""
    # Create Feature Sets
    fs1 = FeatureSet("my-feature-set-1")
    fs1.add(Feature(name="fs1-my-feature-1", dtype=ValueType.INT64))
    fs1.add(Feature(name="fs1-my-feature-2", dtype=ValueType.STRING))
    fs1.add(Entity(name="fs1-my-entity-1", dtype=ValueType.INT64))

    fs2 = FeatureSet("my-feature-set-2")
    fs2.add(Feature(name="fs2-my-feature-1", dtype=ValueType.STRING_LIST))
    fs2.add(Feature(name="fs2-my-feature-2", dtype=ValueType.BYTES_LIST))
    fs2.add(Entity(name="fs2-my-entity-1", dtype=ValueType.INT64))

    # Register Feature Set with Core
    client.apply(fs1)
    client.apply(fs2)

    # List Feature Sets
    feature_sets = client.list_feature_sets()

    # Spot-check names and dtypes of the round-tripped feature sets.
    assert (
        len(feature_sets) == 2
        and feature_sets[0].name == "my-feature-set-1"
        and feature_sets[0].features[0].name == "fs1-my-feature-1"
        and feature_sets[0].features[0].dtype == ValueType.INT64
        and feature_sets[1].features[1].dtype == ValueType.BYTES_LIST
    )
def test_apply_object_and_read(test_feature_store):
    """Apply two entities and two feature views, then read them back and compare."""
    assert isinstance(test_feature_store, FeatureStore)

    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(
        name="fs1_my_entity_1", value_type=ValueType.STRING, description="something"
    )
    e2 = Entity(
        name="fs1_my_entity_2", value_type=ValueType.STRING, description="something"
    )

    def build_view(view_name):
        # Both views share the same features/source and differ only by name.
        return FeatureView(
            name=view_name,
            features=[
                Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
                Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
                Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
                Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
            ],
            entities=["fs1_my_entity_1"],
            tags={"team": "matchmaking"},
            batch_source=batch_source,
            ttl=timedelta(minutes=5),
        )

    fv1 = build_view("my_feature_view_1")
    fv2 = build_view("my_feature_view_2")

    # Register Feature Views and Entities together.
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual

    test_feature_store.teardown()
def test_multiple_featureset_joins(client):
    """Ingest two feature sets and retrieve a batch join across both.

    entity_id maps to feature_value directly; other_entity_id is joined in
    reverse order so both join paths are exercised.
    """
    fs1 = FeatureSet(
        "feature_set_1",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    fs2 = FeatureSet(
        "feature_set_2",
        features=[Feature("other_feature_value", ValueType.INT64)],
        entities=[Entity("other_entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )

    client.apply(fs1)
    time.sleep(10)  # give Core time to register before fetching back
    fs1 = client.get_feature_set(name="feature_set_1", version=1)

    client.apply(fs2)
    time.sleep(10)
    fs2 = client.get_feature_set(name="feature_set_2", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)

    # `[i for i in range(n)]` replaced by `list(range(n))` (ruff PERF402).
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": list(range(N_ROWS)),
        "feature_value": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "other_entity_id": list(range(N_ROWS)),
        "other_feature_value": list(range(N_ROWS)),
    })
    client.ingest(fs2, features_2_df)

    # Join frame: other_entity_id runs in reverse order relative to entity_id.
    entity_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": list(range(N_ROWS)),
        "other_entity_id": list(reversed(range(N_ROWS))),
    })

    feature_retrieval_job = client.get_batch_features(
        entity_rows=entity_df,
        feature_ids=[
            "feature_set_1:1:feature_value",
            "feature_set_2:1:other_feature_value",
        ],
    )
    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["feature_set_1_v1_feature_value"].to_list()
    ]
    assert (
        output["other_entity_id"].to_list()
        == output["feature_set_2_v1_other_feature_value"].to_list()
    )
def test_basic_register_feature_set_success(client):
    """Register feature sets without a project, with a project, and with labels."""
    # Register feature set without project
    cust_trans_fs_expected = FeatureSet.from_yaml(
        f"{DIR_PATH}/basic/cust_trans_fs.yaml"
    )
    driver_fs_expected = FeatureSet.from_yaml(f"{DIR_PATH}/basic/driver_fs.yaml")
    client.apply(cust_trans_fs_expected)
    client.apply(driver_fs_expected)
    cust_trans_fs_actual = client.get_feature_set("customer_transactions")
    assert cust_trans_fs_actual == cust_trans_fs_expected
    driver_fs_actual = client.get_feature_set("driver")
    assert driver_fs_actual == driver_fs_expected

    # Register feature set with project
    cust_trans_fs_expected = FeatureSet.from_yaml(
        f"{DIR_PATH}/basic/cust_trans_fs.yaml"
    )
    client.set_project(PROJECT_NAME)
    client.apply(cust_trans_fs_expected)
    cust_trans_fs_actual = client.get_feature_set(
        "customer_transactions", project=PROJECT_NAME
    )
    assert cust_trans_fs_actual == cust_trans_fs_expected

    # Register feature set with labels
    driver_unlabelled_fs = FeatureSet(
        "driver_unlabelled",
        features=[
            Feature("rating", ValueType.FLOAT),
            Feature("cost", ValueType.FLOAT),
        ],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    driver_labeled_fs_expected = FeatureSet(
        "driver_labeled",
        features=[
            Feature("rating", ValueType.FLOAT),
            Feature("cost", ValueType.FLOAT),
        ],
        entities=[Entity("entity_id", ValueType.IND64 if False else ValueType.INT64)],
        max_age=Duration(seconds=100),
        labels={"key1": "val1"},
    )
    client.set_project(PROJECT_NAME)
    client.apply(driver_unlabelled_fs)
    client.apply(driver_labeled_fs_expected)
    # Only the labeled feature set should match the label filter.
    driver_fs_actual = client.list_feature_sets(
        project=PROJECT_NAME, labels={"key1": "val1"}
    )[0]
    assert driver_fs_actual == driver_labeled_fs_expected

    # reset client's project for other tests
    client.set_project()
def test_feature_set_types_success(self, client, dataframe, mocker):
    """Apply and ingest a feature set covering every supported value type."""
    # (feature name, dtype) pairs — one feature per ValueType under test.
    typed_features = [
        ("float_feature", ValueType.FLOAT),
        ("int64_feature", ValueType.INT64),
        ("int32_feature", ValueType.INT32),
        ("string_feature", ValueType.STRING),
        ("bytes_feature", ValueType.BYTES),
        ("bool_feature", ValueType.BOOL),
        ("double_feature", ValueType.DOUBLE),
        ("float_list_feature", ValueType.FLOAT_LIST),
        ("int64_list_feature", ValueType.INT64_LIST),
        ("int32_list_feature", ValueType.INT32_LIST),
        ("string_list_feature", ValueType.STRING_LIST),
        ("bytes_list_feature", ValueType.BYTES_LIST),
        ("bool_list_feature", ValueType.BOOL_LIST),
        ("double_list_feature", ValueType.DOUBLE_LIST),
    ]
    all_types_fs = FeatureSet(
        name="all_types",
        entities=[Entity(name="user_id", dtype=ValueType.INT64)],
        features=[Feature(name=n, dtype=t) for n, t in typed_features],
        max_age=Duration(seconds=3600),
    )

    # Register with Feast core
    client.apply(all_types_fs)
    # Make Core return the applied feature set without a round trip.
    mocker.patch.object(
        client._core_service_stub,
        "GetFeatureSet",
        return_value=GetFeatureSetResponse(feature_set=all_types_fs.to_proto()),
    )

    # Ingest data into Feast
    client.ingest(all_types_fs, dataframe=dataframe)
def test_apply_entity_success(test_registry):
    """Apply an entity to the registry, then list and get it back."""
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )
    project = "project"

    # Register Entity
    test_registry.apply_entity(entity, project)

    entities = test_registry.list_entities(project)
    entity = entities[0]
    assert (
        len(entities) == 1
        and entity.name == "driver_car_id"
        and entity.value_type == ValueType(ValueProto.ValueType.STRING)
        and entity.description == "Car driver id"
        and "team" in entity.labels
        and entity.labels["team"] == "matchmaking"
    )

    entity = test_registry.get_entity("driver_car_id", project)
    assert (
        entity.name == "driver_car_id"
        and entity.value_type == ValueType(ValueProto.ValueType.STRING)
        and entity.description == "Car driver id"
        and "team" in entity.labels
        and entity.labels["team"] == "matchmaking"
    )

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def test_register_feature_set(self, sqlite_store):
    """Register a feature set spec with the sqlite store."""
    fs = FeatureSet("my-feature-set")
    fs.add(Feature(name="my-feature-1", dtype=ValueType.INT64))
    fs.add(Feature(name="my-feature-2", dtype=ValueType.INT64))
    fs.add(Entity(name="my-entity-1", dtype=ValueType.INT64))
    fs._version = 1
    feature_set_spec_proto = fs.to_proto().spec

    sqlite_store.register_feature_set(feature_set_spec_proto)

    # NOTE(review): feature_row feeds only the commented-out upsert call below;
    # it is kept so that path can be re-enabled without rebuilding the fixture.
    feature_row = FeatureRowProto.FeatureRow(
        feature_set="feature_set_1",
        event_timestamp=Timestamp(),
        fields=[
            FieldProto.Field(name="feature_1", value=ValueProto.Value(float_val=1.2)),
            FieldProto.Field(name="feature_2", value=ValueProto.Value(float_val=1.2)),
            FieldProto.Field(name="feature_3", value=ValueProto.Value(float_val=1.2)),
        ],
    )
    # sqlite_store.upsert_feature_row(feature_set_proto, feature_row)
    assert True  # placeholder assertion while the upsert path stays disabled
def test_update_featureset_apply_featureset_and_ingest_first_subset(
        client, update_featureset_dataframe):
    """Apply a feature set inferred from the first dataframe half, ingest, and verify."""
    subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"]
    subset_df = update_featureset_dataframe.iloc[:5][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    # Derive feature fields from the dataframe columns.
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    client.ingest(feature_set=update_fs, source=subset_df)
    time.sleep(15)  # allow ingested rows to land before retrieval

    feature_retrieval_job = client.get_batch_features(
        entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5],
        feature_refs=[
            f"{PROJECT_NAME}/update_feature1",
            f"{PROJECT_NAME}/update_feature2",
        ],
    )
    output = feature_retrieval_job.to_dataframe().sort_values(by=["entity_id"])
    print(output.head())

    for feature in ("update_feature1", "update_feature2"):
        assert output[feature].to_list() == subset_df[feature].to_list()
def driver_entity():
    """Return a driver Entity labeled for the matchmaking team."""
    return Entity(
        name="driver_id",
        description="Driver entity for car rides",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking", "common_key": "common_val"},
    )
def customer_entity():
    """Return a customer Entity labeled for the customer-service team."""
    return Entity(
        name="customer_id",
        description="Customer entity for rides",
        value_type=ValueType.STRING,
        labels={"team": "customer_service", "common_key": "common_val"},
    )
def test_get_column_names_preserves_feature_ordering():
    """_get_column_names must return features in their declaration order."""
    entity = Entity("my-entity", description="My entity", value_type=ValueType.STRING)
    # Ten single-letter string fields, declared in order a..j.
    column_names = list("abcdefghij")
    fv = FeatureView(
        name="my-fv",
        entities=["my-entity"],
        ttl=timedelta(days=1),
        batch_source=BigQuerySource(table="non-existent-mock"),
        schema=[Field(name=col, dtype=String) for col in column_names],
    )

    _, feature_list, _, _ = _get_column_names(fv, [entity])

    assert feature_list == column_names
def test_feature_set_ingest_success(self, dataframe, client, mocker):
    """Ingest a dataframe into a READY feature set via a mocked producer.

    Fix: the original bound the patched producer to an unused `mocked_queue`
    alias; the binding is dropped.
    """
    client.set_project("project1")
    driver_fs = FeatureSet(
        "driver-feature-set",
        source=KafkaSource(brokers="kafka:9092", topic="test"),
    )
    driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT))
    driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING))
    driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64))
    driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64))

    # Register with Feast core
    client.apply(driver_fs)
    driver_fs = driver_fs.to_proto()
    driver_fs.meta.status = FeatureSetStatusProto.STATUS_READY

    mocker.patch.object(
        client._core_service_stub,
        "GetFeatureSet",
        return_value=GetFeatureSetResponse(feature_set=driver_fs),
    )

    # Need to create a mock producer so no real Kafka connection is attempted
    with patch("feast.client.get_producer"):
        # Ingest data into Feast
        client.ingest("driver-feature-set", dataframe)
def test_update_featureset_update_featureset_and_ingest_second_subset(
        client, update_featureset_dataframe):
    """Update the feature set from the second dataframe half, ingest, and verify."""
    subset_columns = [
        "datetime",
        "entity_id",
        "update_feature1",
        "update_feature3",
        "update_feature4",
    ]
    subset_df = update_featureset_dataframe.iloc[5:][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    # We keep retrying this ingestion until all values make it into the buffer.
    # This is a necessary step because bigquery streaming caches table schemas
    # and as a result, rows may be lost.
    while True:
        ingestion_id = client.ingest(feature_set=update_fs, source=subset_df)
        time.sleep(15)  # wait for rows to get written to bq
        rows_ingested = get_rows_ingested(client, update_fs, ingestion_id)
        if rows_ingested == len(subset_df):
            print(
                f"Number of rows successfully ingested: {rows_ingested}. Continuing."
            )
            break
        print(
            f"Number of rows successfully ingested: {rows_ingested}. Retrying ingestion."
        )
        time.sleep(30)

    def check():
        # Retrieve the second half and compare each ingested feature column.
        feature_retrieval_job = client.get_batch_features(
            entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[5:],
            feature_refs=[
                "update_feature1",
                "update_feature3",
                "update_feature4",
            ],
            project=PROJECT_NAME,
        )
        output = feature_retrieval_job.to_dataframe(
            timeout_sec=180).sort_values(by=["entity_id"])
        print(output.head())

        for feature in ("update_feature1", "update_feature3", "update_feature4"):
            assert output[feature].to_list() == subset_df[feature].to_list()

        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=5))
def test_feature_set_ingest_success(self, dataframe, client, mocker):
    """Ingest a dataframe with the client's message producer mocked out."""
    driver_fs = FeatureSet("driver-feature-set")
    driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT))
    driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING))
    driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64))
    driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64))
    driver_fs.source = KafkaSource(topic="feature-topic", brokers="127.0.0.1")

    # Replace the producer so nothing is actually sent over the wire.
    client._message_producer = MagicMock()
    client._message_producer.produce = MagicMock()

    # Register with Feast core
    client.apply(driver_fs)
    mocker.patch.object(
        client._core_service_stub,
        "GetFeatureSet",
        return_value=GetFeatureSetResponse(feature_set=driver_fs.to_proto()),
    )

    # Ingest data into Feast
    client.ingest("driver-feature-set", dataframe=dataframe)
def test_apply_entity_integration(test_feature_store):
    """Apply an entity to the feature store, then list and get it back."""
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    # Register Entity
    test_feature_store.apply([entity])

    entities = test_feature_store.list_entities()
    entity = entities[0]
    assert (
        len(entities) == 1
        and entity.name == "driver_car_id"
        and entity.value_type == ValueType(ValueProto.ValueType.STRING)
        and entity.description == "Car driver id"
        and "team" in entity.labels
        and entity.labels["team"] == "matchmaking"
    )

    entity = test_feature_store.get_entity("driver_car_id")
    assert (
        entity.name == "driver_car_id"
        and entity.value_type == ValueType(ValueProto.ValueType.STRING)
        and entity.description == "Car driver id"
        and "team" in entity.labels
        and entity.labels["team"] == "matchmaking"
    )

    test_feature_store.teardown()
def test_apply_entity_integration(self, test_client):
    """Apply an entity via the client, then list and get it back from Core."""
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    # Register Entity with Core
    test_client.apply(entity)

    entities = test_client.list_entities()
    entity = entities[0]
    assert (
        len(entities) == 1
        and entity.name == "driver_car_id"
        and entity.value_type == ValueType(ValueProto.ValueType.STRING)
        and entity.description == "Car driver id"
        and "team" in entity.labels
        and entity.labels["team"] == "matchmaking"
    )

    entity = test_client.get_entity("driver_car_id")
    assert (
        entity.name == "driver_car_id"
        and entity.value_type == ValueType(ValueProto.ValueType.STRING)
        and entity.description == "Car driver id"
        and "team" in entity.labels
        and entity.labels["team"] == "matchmaking"
    )
def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Yield a local FeatureStore plus a FeatureView backed by a parquet file."""
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        # Close the handle so the path can be re-opened for writing; the file
        # itself is removed when the context manager exits.
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=str(Path(data_dir_name) / "online_store.db")
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv
def test_order_by_creation_time(client):
    """The most recently ingested values must win for identical entity keys."""
    proc_time_fs = FeatureSet(
        "processing_time",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    client.apply(proc_time_fs)
    time.sleep(10)
    proc_time_fs = client.get_feature_set(name="processing_time", version=1)

    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    N_ROWS = 10
    incorrect_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": list(range(N_ROWS)),
        "feature_value": ["WRONG"] * N_ROWS,
    })
    correct_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": list(range(N_ROWS)),
        "feature_value": ["CORRECT"] * N_ROWS,
    })

    # Ingest the wrong values first, then overwrite with the correct ones.
    client.ingest(proc_time_fs, incorrect_df)
    time.sleep(10)
    client.ingest(proc_time_fs, correct_df)

    feature_retrieval_job = client.get_batch_features(
        entity_rows=incorrect_df[["datetime", "entity_id"]],
        feature_ids=["processing_time:1:feature_value"],
    )
    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["processing_time_v1_feature_value"].to_list() == ["CORRECT"] * N_ROWS
def alltypes_entity():
    """Return the Entity used by the all-types test feature set."""
    return Entity(
        name="alltypes_id",
        description="Driver entity for car rides",
        value_type=ValueType.STRING,
        labels={"cat": "alltypes"},
    )
def test_feature_set_ingest_fail_if_pending(self, dataframe, exception, test_client, mocker):
    """Ingestion must raise when the feature set never leaves STATUS_PENDING."""
    with pytest.raises(exception):
        test_client.set_project("project1")
        driver_fs = FeatureSet(
            "driver-feature-set",
            source=KafkaSource(brokers="kafka:9092", topic="test"),
        )
        driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT))
        driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING))
        driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64))
        driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64))

        # Register with Feast core
        test_client.apply(driver_fs)
        driver_fs = driver_fs.to_proto()
        driver_fs.meta.status = FeatureSetStatusProto.STATUS_PENDING

        mocker.patch.object(
            test_client._core_service_stub,
            "GetFeatureSet",
            return_value=GetFeatureSetResponse(feature_set=driver_fs),
        )

        # Need to create a mock producer
        with patch("feast.client.get_producer"):
            # Ingest data into Feast; short timeout so the pending wait fails fast.
            test_client.ingest("driver-feature-set", dataframe, timeout=1)
def test_feature_set_ingest_throws_exception_if_kafka_down(
        self, dataframe, test_client, exception, mocker):
    """Ingestion must raise when the configured Kafka brokers are unreachable."""
    test_client.set_project("project1")
    driver_fs = FeatureSet(
        "driver-feature-set",
        source=KafkaSource(brokers="localhost:4412", topic="test"),
    )
    driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT))
    driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING))
    driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64))
    driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64))

    # Register with Feast core
    test_client.apply(driver_fs)
    driver_fs = driver_fs.to_proto()
    driver_fs.meta.status = FeatureSetStatusProto.STATUS_READY

    mocker.patch.object(
        test_client._core_service_stub,
        "GetFeatureSet",
        return_value=GetFeatureSetResponse(feature_set=driver_fs),
    )

    # No producer mock here — the connection failure is the point of the test.
    with pytest.raises(exception):
        test_client.ingest("driver-feature-set", dataframe)
def test_entity_without_tags_empty_dict():
    """An entity created without tags exposes an empty tags dict."""
    # Construction emits a deprecation warning, which the test asserts on.
    with pytest.deprecated_call():
        entity = Entity(
            "my-entity", description="My entity", value_type=ValueType.STRING
        )
    assert entity.tags == dict()
    assert len(entity.tags) == 0
def test_entity_class_contains_labels():
    """Labels passed at construction are retrievable from entity.labels.

    Fix: membership tested on the mapping directly rather than via
    `.labels.keys()` (ruff SIM118).
    """
    entity = Entity(
        "my-entity",
        description="My entity",
        value_type=ValueType.STRING,
        labels={"key1": "val1", "key2": "val2"},
    )
    assert "key1" in entity.labels and entity.labels["key1"] == "val1"
    assert "key2" in entity.labels and entity.labels["key2"] == "val2"
def test_update_from_source_success(self, dataframe):
    """Infer feature-set fields from a dataframe with an entity column mapping."""
    fs = FeatureSet("driver-feature-set")
    fs.update_from_dataset(
        dataframe,
        column_mapping={"entity_id": Entity(name="entity", dtype=ValueType.INT64)},
    )
    # entity_id is mapped to an entity, so it must not appear among features.
    assert len(fs.features) == 3 and fs.features[1].name == "feature_2"
def test_hash():
    """Entities with identical fields collapse to one element in a set."""
    entity1 = Entity(name="my-entity", value_type=ValueType.STRING)
    entity2 = Entity(name="my-entity", value_type=ValueType.STRING)
    entity3 = Entity(name="my-entity", value_type=ValueType.FLOAT)
    entity4 = Entity(name="my-entity", value_type=ValueType.FLOAT, description="test")

    # entity1/entity2 are duplicates; entity3 and entity4 are each distinct.
    assert len({entity1, entity2}) == 1
    assert len({entity1, entity3}) == 2
    assert len({entity3, entity4}) == 2
    assert len({entity1, entity2, entity3, entity4}) == 3
def test_apply_all_featuresets(client):
    """Register every feature set used by the downstream batch-retrieval tests."""
    client.set_project(PROJECT_NAME)

    # (feature set name, feature name, feature dtype, entity name), applied in
    # the same order as the original one-by-one calls.
    specs = [
        ("file_feature_set", "feature_value1", ValueType.STRING, "entity_id"),
        ("gcs_feature_set", "feature_value2", ValueType.STRING, "entity_id"),
        ("processing_time", "feature_value3", ValueType.STRING, "entity_id"),
        ("additional_columns", "feature_value4", ValueType.STRING, "entity_id"),
        ("historical", "feature_value5", ValueType.STRING, "entity_id"),
        ("feature_set_1", "feature_value6", ValueType.STRING, "entity_id"),
        ("feature_set_2", "other_feature_value7", ValueType.INT64, "other_entity_id"),
    ]
    for fs_name, feature_name, feature_dtype, entity_name in specs:
        client.apply(
            FeatureSet(
                fs_name,
                features=[Feature(feature_name, feature_dtype)],
                entities=[Entity(entity_name, ValueType.INT64)],
                max_age=Duration(seconds=100),
            )
        )
def prep_bq_fs_and_fv(
    bq_source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    """Yield a GCP FeatureStore plus a FeatureView backed by BigQuery.

    `bq_source_type` selects whether the source is a "table" or a "query".
    """
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000 * 60 * 60 * 24 * 14
    )  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    # Load the synthetic dataset into a uniquely named table.
    df = create_dataset()
    job_config = bigquery.LoadJobConfig()
    table_ref = (
        f"{gcp_project}.{bigquery_dataset}."
        f"{bq_source_type}_correctness_{int(time.time_ns())}"
    )
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

    # Exactly one of table_ref / query is set, depending on bq_source_type.
    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )

    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

    fs.teardown()
def test_reapply_feature_view_success(test_feature_store, dataframe_source):
    """Re-applying an unchanged view keeps materialization history; a changed view resets it."""
    with prep_file_source(df=dataframe_source, event_timestamp_column="ts_1") as file_source:
        e = Entity(name="id", join_keys=["id_join_key"], value_type=ValueType.STRING)

        def make_view(schema_field):
            # The two view variants differ only in their single schema field.
            return FeatureView(
                name="my_feature_view_1",
                schema=[schema_field],
                entities=["id"],
                batch_source=file_source,
                ttl=timedelta(minutes=5),
            )

        # Create and register Feature View
        fv1 = make_view(Field(name="string_col", dtype=String))
        test_feature_store.apply([fv1, e])

        # Fresh view: no materialization intervals yet.
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 0

        # Run materialization
        test_feature_store.materialize(datetime(2020, 1, 1), datetime(2021, 1, 1))
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 1

        # Re-apply the identical view: history must be preserved.
        test_feature_store.apply([fv1])
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 1

        # Apply a changed view: history must be reset.
        fv1 = make_view(Field(name="int64_col", dtype=Int64))
        test_feature_store.apply([fv1])
        fv_stored = test_feature_store.get_feature_view(fv1.name)
        assert len(fv_stored.materialization_intervals) == 0

        test_feature_store.teardown()
def test_entity_class_contains_tags():
    """Tags passed via the deprecated parameter are retrievable from entity.tags.

    Fix: membership tested on the mapping directly rather than via
    `.tags.keys()` (ruff SIM118).
    """
    # Construction with the deprecated `tags` parameter must warn.
    with pytest.deprecated_call():
        entity = Entity(
            "my-entity",
            description="My entity",
            value_type=ValueType.STRING,
            tags={"key1": "val1", "key2": "val2"},
        )
    assert "key1" in entity.tags and entity.tags["key1"] == "val1"
    assert "key2" in entity.tags and entity.tags["key2"] == "val2"