def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)

        with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_local_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=OnlineStoreConfig(
                    local=LocalOnlineStoreConfig(
                        path=str(Path(data_dir_name) / "online_store.db")
                    )
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv])

            yield fs, fv
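# Usage sketch (an assumption, not from the original source): since
# prep_local_fs_and_fv is a generator, a test can consume it through
# contextlib.contextmanager so that the temporary registry and
# online-store files are cleaned up after the test body runs.
from contextlib import contextmanager

local_fs_and_fv = contextmanager(prep_local_fs_and_fv)

def test_local_fs_and_fv_sketch():
    with local_fs_and_fv() as (fs, fv):
        # fs is a fully applied FeatureStore; fv reads the temporary parquet file
        assert fv.name in {v.name for v in fs.list_feature_views()}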
def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a temporary location
    (under SPARK_STAGING_LOCATION) and return it wrapped in a FileSource.

    Args:
        event_timestamp_column (str): the name of the timestamp column in the dataframe.
        config (Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)
        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
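# Hedged usage sketch: the dataframe contents and the `config` object below
# are assumptions for illustration, not taken from the original source.
import pandas as pd

df = pd.DataFrame(
    {"event_timestamp": pd.to_datetime(["2021-01-01"]), "driver_id": [1001]}
)
source = stage_dataframe(df, event_timestamp_column="event_timestamp", config=config)
# source.file_url now points at the staged parquet file under
# SPARK_STAGING_LOCATION/dataframes/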
def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_dynamodb_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="aws",
                online_store=DynamoDBOnlineStoreConfig(region="us-west-2"),
                offline_store=FileOfflineStoreConfig(),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv
def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: Optional[str] = None,
    suffix: Optional[str] = None,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    filename = f"{destination_name}.parquet"
    port = self.minio.get_exposed_port("9000")
    host = self.minio.get_container_host_ip()
    minio_endpoint = f"{host}:{port}"
    self._upload_parquet_file(df, filename, minio_endpoint)

    return FileSource(
        file_format=ParquetFormat(),
        path=f"s3://{self.bucket}/{filename}",
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
        s3_endpoint_override=f"http://{host}:{port}",
    )
def _ingest_test_getfeaturetable_mocked_resp(file_url: str, date_partition_col: str = ""):
    return GetFeatureTableResponse(
        table=FeatureTableProto(
            spec=FeatureTableSpecProto(
                name="ingest_featuretable",
                max_age=Duration(seconds=3600),
                features=[
                    FeatureSpecProto(
                        name="dev_feature_float",
                        value_type=ValueProto.ValueType.FLOAT,
                    ),
                    FeatureSpecProto(
                        name="dev_feature_string",
                        value_type=ValueProto.ValueType.STRING,
                    ),
                ],
                entities=["dev_entity"],
                batch_source=DataSourceProto(
                    file_options=DataSourceProto.FileOptions(
                        file_format=ParquetFormat().to_proto(),
                        file_url=file_url,
                    ),
                    event_timestamp_column="datetime",
                    created_timestamp_column="timestamp",
                    date_partition_column=date_partition_col,
                ),
            ),
            meta=FeatureTableMetaProto(),
        )
    )
def alltypes_featuretable():
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    return FeatureTable(
        name="alltypes",
        entities=["alltypes_id"],
        features=[
            Feature(name="float_feature", dtype=ValueType.FLOAT),
            Feature(name="int64_feature", dtype=ValueType.INT64),
            Feature(name="int32_feature", dtype=ValueType.INT32),
            Feature(name="string_feature", dtype=ValueType.STRING),
            Feature(name="bytes_feature", dtype=ValueType.BYTES),
            Feature(name="bool_feature", dtype=ValueType.BOOL),
            Feature(name="double_feature", dtype=ValueType.DOUBLE),
            Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST),
            Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST),
            Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST),
            Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST),
            Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
            Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
            Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        labels={"cat": "alltypes"},
    )
def stage_entities_to_fs(
    entity_source: pd.DataFrame, staging_location: str, config: Config
) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote
    file storage (a subdirectory of staging_location).

    :return: FileSource with the remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source["event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(
                f, df_export_path.name, remote_uri=entity_staging_uri
            )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
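# Hedged usage sketch: the staging bucket and `config` below are assumptions,
# not from the original source. The returned FileSource is typically passed as
# the entity source for historical feature retrieval.
import pandas as pd

entities = pd.DataFrame(
    {
        "driver_id": [1001, 2001],
        "event_timestamp": pd.to_datetime(["2020-09-01", "2020-09-02"]),
    }
)
entity_source = stage_entities_to_fs(entities, "s3://my-bucket/staging", config)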
def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: str,
    timestamp_field="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    destination_name = self.get_prefixed_table_name(destination_name)

    f = tempfile.NamedTemporaryFile(
        prefix=f"{self.project_name}_{destination_name}",
        suffix=".parquet",
        delete=False,
    )
    df.to_parquet(f.name)
    self.files.append(f)
    return FileSource(
        file_format=ParquetFormat(),
        path=f"{f.name}",
        timestamp_field=timestamp_field,
        created_timestamp_column=created_timestamp_column,
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
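# Note: the NamedTemporaryFile above is created with delete=False, so the
# parquet files outlive the method call. A minimal cleanup sketch, assuming a
# teardown hook exists on the same class (the method name is hypothetical):
import os

def teardown(self):
    for f in self.files:
        f.close()
        if os.path.exists(f.name):
            os.unlink(f.name)  # remove the parquet file created in create_data_source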
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == ValueType.INT64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == ValueType.STRING
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == ValueType.STRING_LIST
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == ValueType.INT64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == ValueType.STRING
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == ValueType.STRING_LIST
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == ValueType.BYTES_LIST
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (
        len(feature_views) == 1
        and feature_views[0].name == "my_feature_view_1"
        and feature_views[0].features[0].name == "fs1_my_feature_1"
        and feature_views[0].features[0].dtype == Int64
        and feature_views[0].features[1].name == "fs1_my_feature_2"
        and feature_views[0].features[1].dtype == String
        and feature_views[0].features[2].name == "fs1_my_feature_3"
        and feature_views[0].features[2].dtype == Array(String)
        and feature_views[0].features[3].name == "fs1_my_feature_4"
        and feature_views[0].features[3].dtype == Array(Bytes)
        and feature_views[0].entities[0] == "fs1_my_entity_1"
    )

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (
        feature_view.name == "my_feature_view_1"
        and feature_view.features[0].name == "fs1_my_feature_1"
        and feature_view.features[0].dtype == Int64
        and feature_view.features[1].name == "fs1_my_feature_2"
        and feature_view.features[1].dtype == String
        and feature_view.features[2].name == "fs1_my_feature_3"
        and feature_view.features[2].dtype == Array(String)
        and feature_view.features[3].name == "fs1_my_feature_4"
        and feature_view.features[3].dtype == Array(Bytes)
        and feature_view.entities[0] == "fs1_my_entity_1"
    )

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)

    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(
        name="fs1_my_entity_1", value_type=ValueType.STRING, description="something"
    )
    e2 = Entity(
        name="fs1_my_entity_2", value_type=ValueType.STRING, description="something"
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature Views and Entities
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual

    test_feature_store.teardown()
def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            50.0,
            True,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
            True,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=1),
            200.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=4),
            datetime(year=2020, month=9, day=1),
            300.0,
            False,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "transactions", schema, df_data)
    file_source = FileSource(
        "event_timestamp", "created_timestamp", ParquetFormat(), file_uri
    )
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable(
        "transactions", ["customer_id"], features, batch_source=file_source
    )
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
def test_apply_data_source(test_registry: Registry):
    # Create a data source and a feature view that uses it
    batch_source = FileSource(
        name="test_source",
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register data source and feature view
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)

    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_data_source = registry_data_sources[0]
    assert registry_data_source == batch_source

    # Check that a change to the batch source propagates
    batch_source.timestamp_field = "new_ts_col"
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)
    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_batch_source = test_registry.list_data_sources(project)[0]
    assert registry_batch_source == batch_source

    test_registry.teardown()

    # Will try to reload the registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
def create_saved_dataset_destination(self) -> SavedDatasetFileStorage:
    port = self.minio.get_exposed_port("9000")
    host = self.minio.get_container_host_ip()
    return SavedDatasetFileStorage(
        path=f"s3://{self.bucket}/persisted/{str(uuid.uuid4())}",
        file_format=ParquetFormat(),
        s3_endpoint_override=f"http://{host}:{port}",
    )
def prep_file_source(df, event_timestamp_column=None) -> FileSource:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
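# Usage sketch (assumed, not from the original source): because
# prep_file_source yields rather than returns, it is naturally wrapped as a
# pytest fixture; the temporary parquet file then stays readable for the
# duration of the test that requests it.
import pandas as pd
import pytest

@pytest.fixture
def file_source():
    df = pd.DataFrame({"ts": pd.to_datetime(["2021-01-01"]), "value": [1.0]})
    yield from prep_file_source(df, event_timestamp_column="ts")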
def test_apply_feature_table_success(self, test_client):
    test_client.set_project("project1")

    # Create Feature Tables
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat("class.path"),
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    ft1 = FeatureTable(
        name="my-feature-table-1",
        features=[
            Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
            Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
            Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1-my-entity-1"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Register Feature Table with Core
    test_client.apply_feature_table(ft1)

    feature_tables = test_client.list_feature_tables()

    # List Feature Tables
    assert (
        len(feature_tables) == 1
        and feature_tables[0].name == "my-feature-table-1"
        and feature_tables[0].features[0].name == "fs1-my-feature-1"
        and feature_tables[0].features[0].dtype == ValueType.INT64
        and feature_tables[0].features[1].name == "fs1-my-feature-2"
        and feature_tables[0].features[1].dtype == ValueType.STRING
        and feature_tables[0].features[2].name == "fs1-my-feature-3"
        and feature_tables[0].features[2].dtype == ValueType.STRING_LIST
        and feature_tables[0].features[3].name == "fs1-my-feature-4"
        and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST
        and feature_tables[0].entities[0] == "fs1-my-entity-1"
    )
def batch_source(self):
    return FileSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table, original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(), datetime.today() + timedelta(days=1)
    )

    status = wait_retry_backoff(
        lambda: (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300,
    )

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers:unique_drivers"}
        ),
    )
def test_schedule_batch_ingestion_jobs(
    pytestconfig, feast_client: Client, feast_spark_client: SparkClient
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *"
    )

    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(
            f"{feast_client.project}-{feature_table.name}".encode()
        ).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *"
    )
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
def stage_dataframe(self, df: pandas.DataFrame, event_timestamp: str) -> FileSource:
    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)
        file_url = _s3_upload(
            f,
            f.name,
            remote_path_prefix=os.path.join(self._staging_location, "dataframes"),
            remote_path_suffix=".parquet",
        )

    return FileSource(
        event_timestamp_column=event_timestamp,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
def test_feature_table_import_export_yaml(self):
    batch_source = FileSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        field_mapping={
            "ride_distance": "ride_distance",
            "ride_duration": "ride_duration",
        },
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat(class_path="class.path"),
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    test_feature_table = FeatureTable(
        name="car_driver",
        features=[
            Feature(name="ride_distance", dtype=ValueType.FLOAT),
            Feature(name="ride_duration", dtype=ValueType.STRING),
        ],
        entities=["car_driver_entity"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Create a string YAML representation of the feature table
    string_yaml = test_feature_table.to_yaml()

    # Create a new feature table object from the YAML string
    actual_feature_table_from_string = FeatureTable.from_yaml(string_yaml)

    # Ensure equality is upheld to the original feature table
    assert test_feature_table == actual_feature_table_from_string
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
def create_data_source(
    self,
    destination: str,
    df: pd.DataFrame,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Dict[str, str] = None,
) -> DataSource:
    self.f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
    df.to_parquet(self.f.name)
    return FileSource(
        file_format=ParquetFormat(),
        path=f"file://{self.f.name}",
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
def batch_source(local_staging_path: str, pytestconfig, request: FixtureRequest):
    if pytestconfig.getoption("env") == "gcloud":
        bq_project = pytestconfig.getoption("bq_project")
        bq_dataset = request.getfixturevalue("bq_dataset")
        return BigQuerySource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            table_ref=f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}",
        )
    else:
        return FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "transactions"),
        )
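# For context, a hedged sketch of the conftest.py hook that would make the
# "env" and "bq_project" options above resolvable; the flag names and defaults
# are assumptions, not taken from the original source.
def pytest_addoption(parser):
    parser.addoption("--env", action="store", default="local")
    parser.addoption("--bq-project", action="store", default="")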
def create_schema(kafka_broker, topic_name, feature_table_name):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name=feature_table_name,
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING),
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    return entity, feature_table
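# Hedged usage sketch: `client` and the broker/topic values below are
# assumptions, not from the original source. The returned objects are
# registered before a streaming ingestion test writes Avro records to the
# Kafka topic.
entity, feature_table = create_schema("localhost:9092", "test_topic", "drivers_stream")
client.apply(entity)
client.apply(feature_table)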
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv

            fs.teardown()
def test_historical_feature_retrieval_with_field_mappings_from_local_spark_session(
    spark,
    client,
    driver_entity,
    bookings_feature_table_with_mapping,
):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
    ])
    df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc)),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc)),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc)),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "drivers", schema, df_data)
    entity_source = FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=file_uri,
    )
    joined_df = client.get_historical_features_df(
        ["bookings:total_completed_bookings"],
        entity_source,
    )
    expected_joined_df_schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("bookings__total_completed_bookings", IntegerType()),
    ])
    expected_joined_df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc), 100),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc), 150),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc), None),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_df_data),
        expected_joined_df_schema,
    )
    assert_dataframe_equal(joined_df, expected_joined_df)
    shutil.rmtree(temp_dir)
def basic_featuretable():
    batch_source = FileSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    stream_source = KafkaSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat(class_path="class.path"),
        topic="test_topic",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
    )
    return FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        stream_source=stream_source,
        labels={"key1": "val1", "key2": "val2"},
    )
def _create_ft(self, client: Client, features) -> None:
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    # Register Entity with Core
    client.apply_entity(entity)

    # Create Feature Tables
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    stream_source = KafkaSource(
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat("class.path"),
        topic="test_topic",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    ft1 = FeatureTable(
        name=self.table_name,
        features=features,
        entities=["driver_car_id"],
        labels={"team": "matchmaking"},
        batch_source=batch_source,
        stream_source=stream_source,
    )

    # Register Feature Table with Core
    client.apply_feature_table(ft1)
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote
    file storage (a subdirectory of staging_location).

    :return: FileSource with the remote destination path
    """
    entity_staging_uri = urlparse(os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme)
    with tempfile.NamedTemporaryFile() as df_export_path:
        entity_source.to_parquet(df_export_path.name)
        bucket = (
            None if entity_staging_uri.scheme == "file" else entity_staging_uri.netloc
        )
        staging_client.upload_file(
            df_export_path.name, bucket, entity_staging_uri.path.lstrip("/")
        )

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )