Example #1
def alltypes_featuretable():
    batch_source = FileSource(
        file_format="parquet",
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    return FeatureTable(
        name="alltypes",
        entities=["alltypes_id"],
        features=[
            Feature(name="float_feature", dtype=ValueType.FLOAT),
            Feature(name="int64_feature", dtype=ValueType.INT64),
            Feature(name="int32_feature", dtype=ValueType.INT32),
            Feature(name="string_feature", dtype=ValueType.STRING),
            Feature(name="bytes_feature", dtype=ValueType.BYTES),
            Feature(name="bool_feature", dtype=ValueType.BOOL),
            Feature(name="double_feature", dtype=ValueType.DOUBLE),
            Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST),
            Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST),
            Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST),
            Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST),
            Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
            Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
            Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        labels={"cat": "alltypes"},
    )
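
A brief usage sketch (not part of the example above): the FeatureTable returned here would typically be registered with the legacy Feast Client, as later examples do. The core_url is illustrative; apply_feature_table and get_feature_table mirror Examples #11 and #23.

from feast import Client

# Assumed connection details for a Feast Core deployment.
client = Client(core_url="localhost:6565")
client.apply_feature_table(alltypes_featuretable())
print(client.get_feature_table("alltypes").name)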
Example #2
def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="aws",
                online_store=DynamoDBOnlineStoreConfig(region="us-west-2"),
                offline_store=FileOfflineStoreConfig(),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv
Example #3
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str,
                         config: Config) -> FileSource:
    """
    Dumps the given entities dataframe as a parquet file and stages it to remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source[
            "event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(f,
                                          df_export_path.name,
                                          remote_uri=entity_staging_uri)

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
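
A hedged usage sketch for stage_entities_to_fs: build a small entity dataframe with an event_timestamp column and stage it. The staging URI, the entity rows, and the pre-built Config are assumptions, not shown in the example above.

import pandas as pd

# Hypothetical entity rows; event_timestamp must be a timezone-naive datetime column.
entities = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": pd.to_datetime(["2021-04-12 10:59:42", "2021-04-12 08:12:10"]),
})
# config is assumed to be a feast Config with staging credentials already set up.
entity_source = stage_entities_to_fs(entities, "gs://some-bucket/staging", config)
print(entity_source.file_url)  # remote path of the staged parquet file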
Example #4
def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={
                "ts_1": "ts",
                "id": "driver_id"
            },
        )
        fv = get_feature_view(file_source)
        with tempfile.TemporaryDirectory(
        ) as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=
                f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=OnlineStoreConfig(local=LocalOnlineStoreConfig(
                    path=str(Path(data_dir_name) / "online_store.db"))),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv])

            yield fs, fv
Example #5
def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a temporary location (under
    SPARK_STAGING_LOCATION) and return it wrapped in a FileSource.

    Args:
        df (pandas.DataFrame): the dataframe to stage and wrap in a FileSource.
        event_timestamp_column (str): the name of the timestamp column in the dataframe.
        config (Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)

        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
Example #6
def stage_customer_daily_profile_parquet_source(directory, df):
    customer_profile_path = os.path.join(directory, "customer_profile.parquet")
    df.to_parquet(path=customer_profile_path, allow_truncated_timestamps=True)
    return FileSource(
        path=customer_profile_path,
        event_timestamp_column="datetime",
        created_timestamp_column="created",
    )
Example #7
    def test_apply_feature_view_integration(self, test_feature_store):

        # Create Feature Views
        batch_source = FileSource(
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        fv1 = FeatureView(
            name="my_feature_view_1",
            features=[
                Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
                Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
                Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
                Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
            ],
            entities=["fs1_my_entity_1"],
            tags={"team": "matchmaking"},
            input=batch_source,
            ttl=timedelta(minutes=5),
        )

        # Register Feature View
        test_feature_store.apply([fv1])

        feature_views = test_feature_store.list_feature_views()

        # List Feature Views
        assert (len(feature_views) == 1
                and feature_views[0].name == "my_feature_view_1"
                and feature_views[0].features[0].name == "fs1_my_feature_1"
                and feature_views[0].features[0].dtype == ValueType.INT64
                and feature_views[0].features[1].name == "fs1_my_feature_2"
                and feature_views[0].features[1].dtype == ValueType.STRING
                and feature_views[0].features[2].name == "fs1_my_feature_3"
                and feature_views[0].features[2].dtype == ValueType.STRING_LIST
                and feature_views[0].features[3].name == "fs1_my_feature_4"
                and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
                and feature_views[0].entities[0] == "fs1_my_entity_1")

        feature_view = test_feature_store.get_feature_view("my_feature_view_1")
        assert (feature_view.name == "my_feature_view_1"
                and feature_view.features[0].name == "fs1_my_feature_1"
                and feature_view.features[0].dtype == ValueType.INT64
                and feature_view.features[1].name == "fs1_my_feature_2"
                and feature_view.features[1].dtype == ValueType.STRING
                and feature_view.features[2].name == "fs1_my_feature_3"
                and feature_view.features[2].dtype == ValueType.STRING_LIST
                and feature_view.features[3].name == "fs1_my_feature_4"
                and feature_view.features[3].dtype == ValueType.BYTES_LIST
                and feature_view.entities[0] == "fs1_my_entity_1")

        test_feature_store.delete_feature_view("my_feature_view_1")
        feature_views = test_feature_store.list_feature_views()
        assert len(feature_views) == 0
Example #8
def stage_driver_hourly_stats_parquet_source(directory, df):
    # Write to disk
    driver_stats_path = os.path.join(directory, "driver_stats.parquet")
    df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True)
    return FileSource(
        path=driver_stats_path,
        event_timestamp_column="datetime",
        created_timestamp_column="",
    )
Example #9
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(name="fs1_my_entity_1",
                value_type=ValueType.STRING,
                description="something")

    e2 = Entity(name="fs1_my_entity_2",
                value_type=ValueType.STRING,
                description="something")

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        input=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual
Example #10
def prep_file_source(df, event_timestamp_column=None) -> FileSource:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
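
Since prep_file_source is a generator, here is a minimal consumption sketch, assuming it is wrapped as a context manager; the dataframe, column name, and the .path attribute access are illustrative assumptions.

from contextlib import contextmanager

import pandas as pd

# Hypothetical input frame with the timestamp column referenced below.
df = pd.DataFrame({"id": [1, 2], "ts": pd.to_datetime(["2021-04-12", "2021-04-13"])})

file_source_cm = contextmanager(prep_file_source)
with file_source_cm(df, event_timestamp_column="ts") as source:
    print(source.path)  # temporary parquet file backing the FileSource (assumed attribute)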
Example #11
    def test_apply_feature_table_success(self, test_client):

        test_client.set_project("project1")

        # Create Feature Tables
        batch_source = FileSource(
            file_format="parquet",
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            bootstrap_servers="localhost:9094",
            class_path="random/path/to/class",
            topic="test_topic",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
        )

        ft1 = FeatureTable(
            name="my-feature-table-1",
            features=[
                Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
                Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
                Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
                Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
            ],
            entities=["fs1-my-entity-1"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Register Feature Table with Core
        test_client.apply_feature_table(ft1)

        feature_tables = test_client.list_feature_tables()

        # List Feature Tables
        assert (
            len(feature_tables) == 1
            and feature_tables[0].name == "my-feature-table-1"
            and feature_tables[0].features[0].name == "fs1-my-feature-1"
            and feature_tables[0].features[0].dtype == ValueType.INT64
            and feature_tables[0].features[1].name == "fs1-my-feature-2"
            and feature_tables[0].features[1].dtype == ValueType.STRING
            and feature_tables[0].features[2].name == "fs1-my-feature-3"
            and feature_tables[0].features[2].dtype == ValueType.STRING_LIST
            and feature_tables[0].features[3].name == "fs1-my-feature-4"
            and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_tables[0].entities[0] == "fs1-my-entity-1"
        )
Example #12
    def batch_source(self):
        return FileSource(
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )
Example #13
    def stage_dataframe(self, df: pandas.DataFrame,
                        event_timestamp: str) -> FileSource:
        with tempfile.NamedTemporaryFile() as f:
            df.to_parquet(f)
            file_url = _s3_upload(
                f,
                f.name,
                remote_path_prefix=os.path.join(self._staging_location,
                                                "dataframes"),
                remote_path_suffix=".parquet",
            )
        return FileSource(
            event_timestamp_column=event_timestamp,
            file_format=ParquetFormat(),
            file_url=file_url,
        )
Example #14
    def test_feature_table_import_export_yaml(self):

        batch_source = FileSource(
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            file_format="parquet",
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            bootstrap_servers="localhost:9094",
            class_path="random/path/to/class",
            topic="test_topic",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
        )

        test_feature_table = FeatureTable(
            name="car_driver",
            features=[
                Feature(name="ride_distance", dtype=ValueType.FLOAT),
                Feature(name="ride_duration", dtype=ValueType.STRING),
            ],
            entities=["car_driver_entity"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Create a string YAML representation of the feature table
        string_yaml = test_feature_table.to_yaml()

        # Create a new feature table object from the YAML string
        actual_feature_table_from_string = FeatureTable.from_yaml(string_yaml)

        # Ensure equality is upheld to original feature table
        assert test_feature_table == actual_feature_table_from_string
Example #15
def basic_featuretable():
    batch_source = FileSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        file_format="PARQUET",
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    stream_source = KafkaSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        bootstrap_servers="localhost:9094",
        class_path="random/path/to/class",
        topic="test_topic",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
    )
    return FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        stream_source=stream_source,
        labels={
            "key1": "val1",
            "key2": "val2"
        },
    )
Example #16
    def _create_ft(self, client: Client, features) -> None:
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        # Register Entity with Core
        client.apply_entity(entity)

        # Create Feature Tables
        batch_source = FileSource(
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            bootstrap_servers="localhost:9094",
            message_format=ProtoFormat("class.path"),
            topic="test_topic",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
        )

        ft1 = FeatureTable(
            name=self.table_name,
            features=features,
            entities=["driver_car_id"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Register Feature Table with Core
        client.apply_feature_table(ft1)
Example #17
def stage_entities_to_fs(entity_source: pd.DataFrame,
                         staging_location: str) -> FileSource:
    """
    Dumps the given entities dataframe as a parquet file and stages it to remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme)
    with tempfile.NamedTemporaryFile() as df_export_path:
        entity_source.to_parquet(df_export_path.name)
        bucket = (None if entity_staging_uri.scheme == "file" else
                  entity_staging_uri.netloc)
        staging_client.upload_file(df_export_path.name, bucket,
                                   entity_staging_uri.path.lstrip("/"))

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
Example #18
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.

wine_features_table = FileSource(
    event_timestamp_column="datetime",
    path=
    "/Users/julesdamji/examples/py/feature_store/data/wine_features.parquet")

# Define an entity for the weather features. You can think of entity as a primary key used to
# fetch features.
acidity = Entity(name="volatile_acidity",
                 value_type=ValueType.DOUBLE,
                 description="acidity")

# Our parquet files contain serving data. Here we define a Feature View that will allow us to serve this
# data to our model online.
wine_features_view = FeatureView(
    name="wine_features",
    entities=["fixed_acidity"],
    ttl=Duration(seconds=86400 * 1),
    features=[
        Feature(name="volatile_acidity", dtype=ValueType.DOUBLE),
        Feature(name="citric_acid", dtype=ValueType.DOUBLE),
Example #19
def create_driver_hourly_stats_source(parquet_path):
    return FileSource(
        path=parquet_path,
        event_timestamp_column="datetime",
        created_timestamp_column="created",
    )
Example #20
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path=
    "/home/ec2-user/SageMaker/feast_fraud_demo/famous_lemur/data/driver_stats.parquet",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
Example #21
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path="/home/chapman/Documents/feast-start/feast_repo/data/driver_stats.parquet",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id",)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
Example #22
    # create dummy entity since Feast demands it
    entity_2 = Entity(
        name="dummy_entity_2",
        description="Dummy entity 2",
        value_type=ValueType.INT32,
        labels={"key": "val"},
    )

    # commit entities
    test_client.apply([entity_1, entity_2])

    # dummy file source
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    # first feature table for testing, with all of Feast's datatypes
    table_1 = FeatureTable(
        name="test_feature_table_all_feature_dtypes",
        features=[
            Feature(name="test_BYTES_feature", dtype=ValueType.BYTES),
            Feature(name="test_STRING_feature", dtype=ValueType.STRING),
            Feature(name="test_INT32_feature", dtype=ValueType.INT32),
            Feature(name="test_INT64_feature", dtype=ValueType.INT64),
            Feature(name="test_DOUBLE_feature", dtype=ValueType.DOUBLE),
            Feature(name="test_FLOAT_feature", dtype=ValueType.FLOAT),
            Feature(name="test_BOOL_feature", dtype=ValueType.BOOL),
Example #23
    def test_apply_feature_table_integration(self, test_client):

        # Create Feature Tables
        batch_source = FileSource(
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            bootstrap_servers="localhost:9094",
            message_format=ProtoFormat("class.path"),
            topic="test_topic",
            event_timestamp_column="ts_col",
        )

        ft1 = FeatureTable(
            name="my-feature-table-1",
            features=[
                Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
                Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
                Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
                Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
            ],
            entities=["fs1-my-entity-1"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Register Feature Table with Core
        test_client.apply(ft1)

        feature_tables = test_client.list_feature_tables()

        # List Feature Tables
        assert (len(feature_tables) == 1
                and feature_tables[0].name == "my-feature-table-1"
                and feature_tables[0].features[0].name == "fs1-my-feature-1"
                and feature_tables[0].features[0].dtype == ValueType.INT64
                and feature_tables[0].features[1].name == "fs1-my-feature-2"
                and feature_tables[0].features[1].dtype == ValueType.STRING
                and feature_tables[0].features[2].name == "fs1-my-feature-3"
                and feature_tables[0].features[2].dtype
                == ValueType.STRING_LIST
                and feature_tables[0].features[3].name == "fs1-my-feature-4"
                and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST
                and feature_tables[0].entities[0] == "fs1-my-entity-1")

        feature_table = test_client.get_feature_table("my-feature-table-1")
        assert (feature_table.name == "my-feature-table-1"
                and feature_table.features[0].name == "fs1-my-feature-1"
                and feature_table.features[0].dtype == ValueType.INT64
                and feature_table.features[1].name == "fs1-my-feature-2"
                and feature_table.features[1].dtype == ValueType.STRING
                and feature_table.features[2].name == "fs1-my-feature-3"
                and feature_table.features[2].dtype == ValueType.STRING_LIST
                and feature_table.features[3].name == "fs1-my-feature-4"
                and feature_table.features[3].dtype == ValueType.BYTES_LIST
                and feature_table.entities[0] == "fs1-my-entity-1")

        test_client.delete_feature_table("my-feature-table-1")
        feature_tables = test_client.list_feature_tables()
        assert len(feature_tables) == 0
Example #24
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path="%PARQUET_PATH%",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
Example #25
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    path=
    "/Users/julesdamji/examples/py/feature_store/feature_repo/data/driver_stats.parquet",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
Example #26
from google.protobuf.duration_pb2 import Duration

from feast import Entity, FeatureView, ValueType
from feast.data_source import FileSource

driver_hourly_stats = FileSource(
    path="%PARQUET_PATH%",  # placeholder to be replaced by the test
    created_timestamp_column="created",
)

driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id",)

# features are inferred from columns of data source
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 1),
    online=True,
    input=driver_hourly_stats,
    tags={},
)
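
A hedged sketch of exercising the inference above: after %PARQUET_PATH% is replaced with a real file, applying the objects lets Feast infer the feature schema from the parquet columns. The repo path and the printed feature names are assumptions.

from feast import FeatureStore

store = FeatureStore(repo_path=".")  # assumed local repo containing the definitions above
store.apply([driver, driver_hourly_stats_view])

# The inferred features should mirror the parquet columns, e.g. conv_rate / acc_rate / avg_daily_trips.
print([feature.name for feature in store.get_feature_view("driver_hourly_stats").features])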
Example #27
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        project: str = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a Pandas DataFrame, the dataframe will be exported to the staging
                location as a parquet file. It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a Pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            project: Specifies the project that contains the feature tables
                which the requested features belong to.

        Returns:
            A retrieval job object that can be used to monitor retrieval
            progress asynchronously and to materialize the results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source, project="my_project")
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, project)
        output_location = os.path.join(
            self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
            str(uuid.uuid4()),
        )
        output_format = self._config.get(
            CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

        if isinstance(entity_source, pd.DataFrame):
            staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
            entity_staging_uri = urlparse(
                os.path.join(staging_location, str(uuid.uuid4())))
            staging_client = get_staging_client(entity_staging_uri.scheme)
            with tempfile.NamedTemporaryFile() as df_export_path:
                entity_source.to_parquet(df_export_path.name)
                bucket = (None if entity_staging_uri.scheme == "file" else
                          entity_staging_uri.netloc)
                staging_client.upload_file(df_export_path.name, bucket,
                                           entity_staging_uri.path.lstrip("/"))
                entity_source = FileSource(
                    "event_timestamp",
                    "created_timestamp",
                    ParquetFormat(),
                    entity_staging_uri.geturl(),
                )

        return start_historical_feature_retrieval_job(
            self,
            entity_source,
            feature_tables,
            output_format,
            os.path.join(output_location, str(uuid.uuid4())),
        )
Example #28
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
# If it's parquet, it can just be a folder of parquet files, based on the parquet
# format - then you can keep appending to the folder as required.
batch_source = FileSource(
    path="/home/chapman/Documents/feast-start/feature_multi/data/events",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
customer = Entity(
    name="user_id",
    value_type=ValueType.INT64,
    description="customer id for transactions",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
customer_events = FeatureView(
    name="customer_events",
Example #29
# This is an example feature definition file

from google.protobuf.duration_pb2 import Duration

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource

# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
batch_source = FileSource(
    path=
    "/home/chapman/Documents/feast-start/feature_transaction/data/transactions.parquet",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
customer = Entity(
    name="user_id",
    value_type=ValueType.INT64,
    description="customer id for transactions",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
customer_transactions = FeatureView(
    name="customer_transactions",
    entities=["user_id"],
Example #30
# Feature definition

from datetime import datetime
from pathlib import Path

from feast import Entity, Feature, FeatureView, ValueType
from feast.data_source import FileSource
from google.protobuf.duration_pb2 import Duration

from config import config

# Read data
START_TIME = "2020-02-17"
project_details = FileSource(
    path=str(Path(config.DATA_DIR, "features.parquet")),
    event_timestamp_column="created_on",
)

# Define an entity for the project
project = Entity(
    name="id",
    value_type=ValueType.INT64,
    description="project id",
)

# Define a Feature View for each project
# Can be used for fetching historical data and online serving
project_details_view = FeatureView(
    name="project_details",
    entities=["id"],
    ttl=Duration(seconds=(datetime.today() -