コード例 #1
0
ファイル: test_partial_apply.py プロジェクト: feast-dev/feast
def test_partial() -> None:
    """
    Register one additional feature view through the partial apply API on top
    of the example repo, then run the read/write check against both the
    pre-existing ``driver_locations`` view and the newly added
    ``driver_locations_100`` view.
    """
    cli_runner = CliRunner()
    example_repo = get_example_repo("example_feature_repo_1.py")

    with cli_runner.local_repo(example_repo, "bigquery") as store:
        # Source backing the additional view.
        extra_source = BigQuerySource(
            table="feast-oss.public.drivers",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        extra_schema = [
            Field(name="lat", dtype=Float32),
            Field(name="lon", dtype=String),
            Field(name="name", dtype=String),
        ]
        extra_view = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=timedelta(days=1),
            schema=extra_schema,
            online=True,
            batch_source=extra_source,
            tags={},
        )

        # Partial apply: only the new view is handed to the store.
        store.apply([extra_view])

        # Both the original and the added view must pass the RW test.
        for view_name in ("driver_locations", "driver_locations_100"):
            basic_rw_test(store, view_name=view_name)
コード例 #2
0
    def test_basic(self) -> None:
        """
        Register one additional feature view through the partial apply API on
        top of the example repo, then run the read/write check against both
        the pre-existing ``driver_locations`` view and the newly added
        ``driver_locations_100`` view.
        """
        cli_runner = CliRunner()
        example_repo = get_example_repo("example_feature_repo_1.py")

        with cli_runner.local_repo(example_repo) as store:
            # Source backing the additional view.
            extra_source = BigQuerySource(
                table_ref="rh_prod.ride_hailing_co.drivers",
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created_timestamp",
            )

            extra_features = [
                Feature(name="lat", dtype=ValueType.FLOAT),
                Feature(name="lon", dtype=ValueType.STRING),
                Feature(name="name", dtype=ValueType.STRING),
            ]
            extra_view = FeatureView(
                name="driver_locations_100",
                entities=["driver"],
                ttl=Duration(seconds=86400 * 1),
                features=extra_features,
                online=True,
                input=extra_source,
                tags={},
            )

            # Partial apply: only the new view is handed to the store.
            store.apply([extra_view])

            # Both the original and the added view must pass the RW test.
            for view_name in ("driver_locations", "driver_locations_100"):
                basic_rw_test(store, view_name=view_name)
コード例 #3
0
def store_offline(feature_store: FeatureStore,
                  dataframe: FlyteSchema) -> FeatureStore:
    """Apply the horse-colic entity and feature view to ``feature_store``.

    Args:
        feature_store: store on which ``apply`` is called.
        dataframe: schema object whose ``remote_path`` is used (stringified)
            as the path of the file source backing the feature view.

    Returns:
        The same ``feature_store`` object that was passed in.
    """
    entity = Entity(name="Hospital Number", value_type=ValueType.STRING)

    # (column name, value type) pairs, in the order they are declared.
    feature_specs = (
        ("rectal temperature", ValueType.FLOAT),
        ("total protein", ValueType.FLOAT),
        ("peripheral pulse", ValueType.FLOAT),
        ("surgical lesion", ValueType.STRING),
        ("abdominal distension", ValueType.FLOAT),
        ("nasogastric tube", ValueType.STRING),
        ("outcome", ValueType.STRING),
        ("packed cell volume", ValueType.FLOAT),
        ("nasogastric reflux PH", ValueType.FLOAT),
    )
    features = [
        Feature(name=column, dtype=kind) for column, kind in feature_specs
    ]

    file_source = FileSource(
        path=str(dataframe.remote_path),
        event_timestamp_column="timestamp",
    )

    stats_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=features,
        batch_source=file_source,
        ttl=timedelta(days=1),
    )

    # Register the entity and the view with the store.
    feature_store.apply([entity, stats_view])
    return feature_store
コード例 #4
0
def setup_feature_store():
    """Prepare a local feature repo for a FeatureStore docstring test.

    Initialises a ``feature_repo`` repo from the ``local`` template, applies a
    ``driver_id`` entity plus a ``driver_hourly_stats`` view over the repo's
    parquet file, and materializes the window from three hours ago up to ten
    minutes ago.
    """
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    store = FeatureStore(repo_path="feature_repo")

    driver_entity = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    stats_source = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )

    stats_features = [
        Feature(name=column, dtype=kind)
        for column, kind in (
            ("conv_rate", ValueType.FLOAT),
            ("acc_rate", ValueType.FLOAT),
            ("avg_daily_trips", ValueType.INT64),
        )
    ]
    stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=stats_features,
        batch_source=stats_source,
    )

    store.apply([stats_view, driver_entity])

    # Window bounds are taken from two separate clock reads, start first.
    window_start = datetime.utcnow() - timedelta(hours=3)
    window_end = datetime.utcnow() - timedelta(minutes=10)
    store.materialize(start_date=window_start, end_date=window_end)
コード例 #5
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_order_feature_view(source, infer_features: bool = False):
    """Build the ``order`` feature view keyed by ``driver`` and ``customer_id``.

    When ``infer_features`` is truthy the view is created with ``schema=None``;
    otherwise a single ``order_is_success`` Int32 field is declared.
    """
    if infer_features:
        order_schema = None
    else:
        order_schema = [Field(name="order_is_success", dtype=Int32)]

    order_view = FeatureView(
        name="order",
        entities=["driver", "customer_id"],
        schema=order_schema,
        source=source,
        ttl=timedelta(days=2),
    )
    return order_view
コード例 #6
0
def create_order_feature_view(source, infer_features: bool = False):
    """Build the ``order`` feature view keyed by ``driver`` and ``customer_id``.

    When ``infer_features`` is truthy the view is created with
    ``features=None``; otherwise a single ``order_is_success`` INT32 feature
    is declared.
    """
    if infer_features:
        order_features = None
    else:
        order_features = [
            Feature(name="order_is_success", dtype=ValueType.INT32)
        ]

    order_view = FeatureView(
        name="order",
        entities=["driver", "customer_id"],
        features=order_features,
        batch_source=source,
        ttl=timedelta(days=2),
    )
    return order_view
コード例 #7
0
def driver_feature_view(data_source: DataSource,
                        name="test_correctness") -> FeatureView:
    """Return a ``driver``-keyed view exposing one FLOAT feature, ``value``.

    Args:
        data_source: passed to the view as its ``input``.
        name: name given to the feature view.
    """
    value_feature = Feature("value", ValueType.FLOAT)
    view = FeatureView(
        name=name,
        entities=["driver"],
        features=[value_feature],
        ttl=timedelta(days=5),
        input=data_source,
    )
    return view
コード例 #8
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_field_mapping_feature_view(source):
    """Return the entity-less ``field_mapping`` view backed by ``source``."""
    # Declared with Feature objects on purpose: this checks that Features
    # still work for FeatureViews.
    mapped_features = [Feature(name="feature_name", dtype=ValueType.INT32)]

    view = FeatureView(
        name="field_mapping",
        entities=[],
        features=mapped_features,
        source=source,
        ttl=timedelta(days=2),
    )
    return view
コード例 #9
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_location_stats_feature_view(source, infer_features: bool = False):
    """Build the ``location_stats`` feature view.

    The view's only entity is whatever ``location()`` returns. When
    ``infer_features`` is truthy the schema is ``None``; otherwise a single
    ``temperature`` Int32 field is declared.
    """
    join_entities = [location()]

    if infer_features:
        stats_schema = None
    else:
        stats_schema = [Field(name="temperature", dtype=Int32)]

    return FeatureView(
        name="location_stats",
        entities=join_entities,
        schema=stats_schema,
        source=source,
        ttl=timedelta(days=2),
    )
コード例 #10
0
ファイル: feature_views.py プロジェクト: pyalex/feast
def create_location_stats_feature_view(source, infer_features: bool = False):
    """Build the ``location_stats`` feature view keyed by ``location_id``.

    When ``infer_features`` is truthy the view is created with
    ``features=None``; otherwise a single ``temperature`` INT32 feature is
    declared.
    """
    if infer_features:
        stats_features = None
    else:
        stats_features = [Feature(name="temperature", dtype=ValueType.INT32)]

    return FeatureView(
        name="location_stats",
        entities=["location_id"],
        features=stats_features,
        batch_source=source,
        ttl=timedelta(days=2),
    )
コード例 #11
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_item_embeddings_feature_view(source, infer_features: bool = False):
    """Build the ``item_embeddings`` feature view keyed by ``item``.

    When ``infer_features`` is truthy the schema is ``None``; otherwise two
    array-typed fields are declared, one of Float64 and one of Float32.
    """
    if infer_features:
        embedding_schema = None
    else:
        embedding_schema = [
            Field(name="embedding_double", dtype=Array(Float64)),
            Field(name="embedding_float", dtype=Array(Float32)),
        ]

    return FeatureView(
        name="item_embeddings",
        entities=["item"],
        schema=embedding_schema,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
コード例 #12
0
def create_global_stats_feature_view(source, infer_features: bool = False):
    """Build the entity-less ``global_stats`` feature view.

    When ``infer_features`` is truthy the view is created with
    ``features=None``; otherwise ``num_rides`` (INT32) and
    ``avg_ride_length`` (FLOAT) are declared.
    """
    if infer_features:
        stats_features = None
    else:
        stats_features = [
            Feature(name="num_rides", dtype=ValueType.INT32),
            Feature(name="avg_ride_length", dtype=ValueType.FLOAT),
        ]

    return FeatureView(
        name="global_stats",
        entities=[],
        features=stats_features,
        batch_source=source,
        ttl=timedelta(days=2),
    )
コード例 #13
0
def create_item_embeddings_feature_view(source, infer_features: bool = False):
    """Build the ``item_embeddings`` feature view keyed by ``item``.

    When ``infer_features`` is truthy the view is created with
    ``features=None``; otherwise a DOUBLE_LIST and a FLOAT_LIST embedding
    feature are declared.
    """
    if infer_features:
        embedding_features = None
    else:
        embedding_features = [
            Feature(name="embedding_double", dtype=ValueType.DOUBLE_LIST),
            Feature(name="embedding_float", dtype=ValueType.FLOAT_LIST),
        ]

    return FeatureView(
        name="item_embeddings",
        entities=["item"],
        features=embedding_features,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
コード例 #14
0
def create_customer_daily_profile_feature_view(source):
    """Return the ``customer_profile`` view keyed by ``customer_id``.

    The view declares three features (two FLOAT, one INT32) and reads from
    ``source`` as its batch source.
    """
    profile_features = [
        Feature(name="current_balance", dtype=ValueType.FLOAT),
        Feature(name="avg_passenger_count", dtype=ValueType.FLOAT),
        Feature(name="lifetime_trip_count", dtype=ValueType.INT32),
    ]
    return FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        features=profile_features,
        batch_source=source,
        ttl=timedelta(days=2),
    )
コード例 #15
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def driver_feature_view(
    data_source: DataSource,
    name="test_correctness",
    infer_features: bool = False,
    dtype: FeastType = Float32,
) -> FeatureView:
    """Return a ``driver``-keyed view with an optional single ``value`` field.

    Args:
        data_source: passed to the view as its ``source``.
        name: name given to the feature view.
        infer_features: when truthy, the view is created with ``schema=None``.
        dtype: type of the ``value`` field when the schema is declared.
    """
    if infer_features:
        view_schema = None
    else:
        view_schema = [Field(name="value", dtype=dtype)]

    view = FeatureView(
        name=name,
        entities=["driver"],
        schema=view_schema,
        ttl=timedelta(days=5),
        source=data_source,
    )
    return view
コード例 #16
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_pushable_feature_view(batch_source: DataSource):
    """Return the ``pushable_location_stats`` view fed by a push source.

    ``batch_source`` is wrapped in a ``PushSource`` named
    ``location_stats_push_source``, and that push source becomes the view's
    ``source``.
    """
    pushed = PushSource(
        name="location_stats_push_source",
        batch_source=batch_source,
    )

    # Declared with Feature objects on purpose: this checks that Features
    # still work for FeatureViews.
    temperature = Feature(name="temperature", dtype=ValueType.INT32)

    view = FeatureView(
        name="pushable_location_stats",
        entities=["location_id"],
        features=[temperature],
        ttl=timedelta(days=2),
        source=pushed,
    )
    return view
コード例 #17
0
def create_driver_hourly_stats_feature_view(source):
    """Return the ``driver_stats`` view keyed by ``driver``.

    The view declares three features (two FLOAT, one INT32) and reads from
    ``source`` as its batch source.
    """
    stats_features = [
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT32),
    ]
    return FeatureView(
        name="driver_stats",
        entities=["driver"],
        features=stats_features,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
コード例 #18
0
ファイル: feature_views.py プロジェクト: qooba/feast
def global_feature_view(
    data_source: DataSource,
    name="test_entityless",
    infer_features: bool = False,
    value_type: ValueType = ValueType.INT32,
) -> FeatureView:
    """Return an entity-less view with an optional ``entityless_value`` feature.

    Args:
        data_source: passed to the view as its ``input``.
        name: name given to the feature view.
        infer_features: when truthy, the view is created with
            ``features=None``.
        value_type: value type of the declared feature.
    """
    if infer_features:
        declared = None
    else:
        declared = [Feature("entityless_value", value_type)]

    view = FeatureView(
        name=name,
        entities=[],
        features=declared,
        ttl=timedelta(days=5),
        input=data_source,
    )
    return view
コード例 #19
0
ファイル: feature_views.py プロジェクト: qooba/feast
def driver_feature_view(
    data_source: DataSource,
    name="test_correctness",
    infer_features: bool = False,
    value_type: ValueType = ValueType.FLOAT,
) -> FeatureView:
    """Return a ``driver``-keyed view with an optional single ``value`` feature.

    Args:
        data_source: passed to the view as its ``input``.
        name: name given to the feature view.
        infer_features: when truthy, the view is created with
            ``features=None``.
        value_type: value type of the declared feature.
    """
    if infer_features:
        declared = None
    else:
        declared = [Feature("value", value_type)]

    view = FeatureView(
        name=name,
        entities=["driver"],
        features=declared,
        ttl=timedelta(days=5),
        input=data_source,
    )
    return view
コード例 #20
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_driver_hourly_stats_feature_view(source,
                                            infer_features: bool = False):
    """Build the ``driver_stats`` feature view keyed by ``driver``.

    When ``infer_features`` is truthy the schema is ``None``; otherwise
    ``conv_rate`` and ``acc_rate`` (Float32) plus ``avg_daily_trips`` (Int32)
    are declared.
    """
    if infer_features:
        stats_schema = None
    else:
        stats_schema = [
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int32),
        ]

    return FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=stats_schema,
        source=source,
        ttl=timedelta(hours=2),
    )
コード例 #21
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def create_customer_daily_profile_feature_view(source,
                                               infer_features: bool = False):
    """Build the ``customer_profile`` feature view keyed by ``customer_id``.

    When ``infer_features`` is truthy the schema is ``None``; otherwise
    ``current_balance`` and ``avg_passenger_count`` (Float32) plus
    ``lifetime_trip_count`` (Int32) are declared.
    """
    if infer_features:
        profile_schema = None
    else:
        profile_schema = [
            Field(name="current_balance", dtype=Float32),
            Field(name="avg_passenger_count", dtype=Float32),
            Field(name="lifetime_trip_count", dtype=Int32),
        ]

    return FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        schema=profile_schema,
        source=source,
        ttl=timedelta(days=2),
    )
コード例 #22
0
ファイル: feature_views.py プロジェクト: feast-dev/feast
def global_feature_view(
    data_source: DataSource,
    name="test_entityless",
    infer_features: bool = False,
    value_type: ValueType = ValueType.INT32,
) -> FeatureView:
    """Return an entity-less view with an optional ``entityless_value`` feature.

    Args:
        data_source: passed to the view as its ``source``.
        name: name given to the feature view.
        infer_features: when truthy, the view is created with
            ``features=None``.
        value_type: value type of the declared feature.
    """
    # Declared with Feature objects on purpose: this checks that Features
    # still work for FeatureViews.
    if infer_features:
        declared = None
    else:
        declared = [Feature(name="entityless_value", dtype=value_type)]

    view = FeatureView(
        name=name,
        entities=[],
        features=declared,
        ttl=timedelta(days=5),
        source=data_source,
    )
    return view
コード例 #23
0
ファイル: driver_repo.py プロジェクト: feast-dev/feast
# Feature views are a grouping based on how features are stored in either the
# online or offline store.
driver_stats_fv = FeatureView(
    # The unique name of this feature view. Two feature views in a single
    # project cannot have the same name.
    name="driver_hourly_stats",
    # The list of entities specifies the keys required for joining or looking
    # up features from this feature view. Each reference provided in this field
    # corresponds to the name of a defined entity.
    entities=["driver"],
    # The timedelta is the maximum age that each feature value may have
    # relative to its lookup time. For historical features (used in training),
    # TTL is relative to each timestamp provided in the entity dataframe.
    # TTL also allows for eviction of keys from online stores and limits the
    # amount of historical scanning required for historical feature values
    # during retrieval.
    ttl=timedelta(weeks=52),
    # The fields listed below act as a schema: they define which features are
    # materialized into a store, and they are used as references during
    # retrieval when building a training dataset or serving features.
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    # Batch sources are used to find feature values. In the case of this feature
    # view we will query a source table on Redshift for driver statistics
    # features.
    # NOTE(review): `driver_stats_source` is defined outside this snippet;
    # confirm that it is in fact a Redshift source.
    batch_source=driver_stats_source,
)
コード例 #24
0
# Define an entity for the driver. You can think of an entity as a primary key
# used to fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column,
# timestamps and three feature columns. Here we define a Feature View that will
# allow us to serve this data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    # 86400 seconds per day * 365 days.
    ttl=Duration(seconds=86400 * 365),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    # `driver_hourly_stats` (the batch source) is defined outside this snippet.
    batch_source=driver_hourly_stats,
    tags={},
)

# NOTE(review): the store is constructed with an empty string as its only
# positional argument -- presumably the repo path; confirm what "" resolves to.
fs = FeatureStore("")
# Register the view and the entity with the store.
fs.apply([driver_hourly_stats_view, driver])

# Materialize incrementally up to the current time.
# NOTE(review): `datetime.now()` is a naive local timestamp -- confirm the
# store interprets it as intended.
now = datetime.now()
fs.materialize_incremental(now)
コード例 #25
0
# If it's parquet, the path can just be a folder of parquet files (based on the
# parquet format); you can then keep appending files to the folder as required.
batch_source = FileSource(
    path="/home/chapman/Documents/feast-start/feature_multi/data/events",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Define an entity for the customer. You can think of an entity as a primary
# key used to fetch features.
customer = Entity(
    name="user_id",
    value_type=ValueType.INT64,
    description="customer id for transactions",
)

# Define a Feature View over the events source that exposes a single string
# feature, `event`, keyed by `user_id`, so it can be served online.
# NOTE(review): presumably the parquet data carries a user_id column and the
# two timestamp columns named above -- verify against the data files.
customer_events = FeatureView(
    name="customer_events",
    entities=["user_id"],
    # 86400 seconds = one day.
    ttl=Duration(seconds=86400 * 1),
    features=[
        Feature(name="event", dtype=ValueType.STRING),
    ],
    online=True,
    input=batch_source,
    tags={},
)
コード例 #26
0
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

# BigQuery source pointing at a table reference named "nonexistent_table",
# with an empty event timestamp column.
# NOTE(review): presumably a deliberate fixture for exercising a failure path
# with a missing table -- confirm against the tests that load this repo.
nonexistent_source = BigQuerySource(
    table_ref="project.dataset.nonexistent_table", event_timestamp_column=""
)

# Entity used as the join key of the feature view below.
driver = Entity(name="driver", value_type=ValueType.INT64, description="driver id",)

# Feature view named "driver_locations" that reads from the source above.
nonexistent_features = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    input=nonexistent_source,
)
コード例 #27
0
def test_write_to_online_store_event_check(local_redis_environment):
    """Exercise event-timestamp handling of writes to the online store.

    The test registers a single-feature view over a file source, then
    interleaves direct ``write_to_online_store`` calls with one
    ``materialize`` run, asserting after each step which ``string_col``
    value is served for ids 123, 567 and 890.

    The test returns immediately (doing nothing) when the
    ``FEAST_IS_LOCAL_TEST`` environment variable is exactly ``"True"``.

    Args:
        local_redis_environment: fixture object exposing a ``feature_store``
            attribute (presumably backed by a local Redis online store, going
            by the fixture name -- confirm against the fixture definition).
    """
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store

    # Three reference timestamps (naive UTC, rounded to milliseconds) used to
    # stamp the rows written below.
    now = pd.Timestamp(datetime.datetime.utcnow()).round("ms")
    hour_ago = pd.Timestamp(datetime.datetime.utcnow() -
                            timedelta(hours=1)).round("ms")
    latest = pd.Timestamp(datetime.datetime.utcnow() +
                          timedelta(seconds=1)).round("ms")

    # Rows backing the file source; they only reach the online store through
    # the materialize() call at the end of the test.
    data = {
        "id": [123, 567, 890],
        "string_col": ["OLD_FEATURE", "LATEST_VALUE2", "LATEST_VALUE3"],
        "ts_1": [hour_ago, now, now],
    }
    dataframe_source = pd.DataFrame(data)
    with prep_file_source(df=dataframe_source,
                          event_timestamp_column="ts_1") as file_source:
        # NOTE(review): the entity is declared as STRING while the ids used
        # below are ints -- left as in the original test.
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="feature_view_123",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        # Register Feature View and Entity
        fs.apply([fv1, e])

        # data to ingest into Online Store (recent)
        data = {
            "id": [123],
            "string_col": ["hi_123"],
            "ts_1": [now],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}],
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"

        # data to ingest into Online Store (1 hour delayed data)
        # should NOT overwrite features for id=123 because it's less recent
        # data; ids 567 and 890 have no value yet, so they take these rows
        data = {
            "id": [123, 567, 890],
            "string_col": ["bye_321", "hello_123", "greetings_321"],
            "ts_1": [hour_ago, hour_ago, hour_ago],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "hi_123"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # should overwrite string_col for id=123 because it's most recent
        # based on event_timestamp
        data = {
            "id": [123],
            "string_col": ["LATEST_VALUE"],
            "ts_1": [latest],
        }
        df_data = pd.DataFrame(data)

        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # writes to online store via datasource (dataframe_source)
        # materialization. Both window bounds are taken from utcnow() so the
        # window matches the UTC row timestamps above and does not shift with
        # the host's local UTC offset.
        fs.materialize(
            start_date=datetime.datetime.utcnow() - timedelta(hours=12),
            end_date=datetime.datetime.utcnow(),
        )

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{"id": 123}, {"id": 567}, {"id": 890}],
        ).to_df()
        # id=123 keeps the directly written value (stamped `latest`), while
        # 567 and 890 pick up the newer rows from the file source.
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "LATEST_VALUE2"
        assert df["string_col"].iloc[2] == "LATEST_VALUE3"
コード例 #28
0
ファイル: materialize.py プロジェクト: Shopify/feast
# Entity for the driver; `driver_id` is the key the feature view below is
# joined on.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column,
# timestamps and three feature columns. Here we define a Feature View that will
# allow us to serve this data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    # 86400 seconds per day * 7 days.
    ttl=Duration(seconds=86400 * 7),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    # `driver_hourly_stats` (the batch source) is defined outside this snippet.
    batch_source=driver_hourly_stats,
    tags={},
)


# For Benchmarks
# Please read more in Feast RFC-031 (link https://docs.google.com/document/d/12UuvTQnTTCJhdRgy6h10zSbInNGSyEJkIxpOcgOen1I/edit)
# about this benchmark setup
def generate_data(num_rows: int, num_features: int, key_space: int,
                  destination: str) -> pd.DataFrame:
    features = [f"feature_{i}" for i in range(num_features)]
    columns = ["entity", "event_timestamp"] + features
コード例 #29
0
    value_type=ValueType.INT64,
    description="driver id",
)

# Entity with string-typed values, registered under the name "customer".
customer = Entity(
    name="customer",  # The name is derived from this argument, not object name.
    value_type=ValueType.STRING,
)


# Feature view keyed by the "driver" entity, with a one-day TTL and two
# declared features (lat as FLOAT, lon as STRING).
driver_locations = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    online=True,
    # `driver_locations_source` is defined outside this snippet.
    batch_source=driver_locations_source,
    tags={},
)

customer_profile = FeatureView(
    name="customer_profile",
    entities=["customer"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="avg_orders_day", dtype=ValueType.FLOAT),
        Feature(name="name", dtype=ValueType.STRING),
        Feature(name="age", dtype=ValueType.INT64),
    ],
コード例 #30
0
# Spark source reading the customer daily profile parquet file that sits under
# CURRENT_DIR/data; rows are timestamped by `event_timestamp`, with `created`
# as the created-timestamp column.
customer_daily_profile = SparkSource(
    name="customer_daily_profile",
    path=f"{CURRENT_DIR}/data/customer_daily_profile.parquet",
    file_format="parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# Feature Views
# Driver stats view: keyed by "driver", seven-day TTL, three declared fields.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver"],
    ttl=timedelta(days=7),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    # `driver_hourly_stats` (the source) is defined outside this snippet.
    source=driver_hourly_stats,
    tags={},
)
customer_daily_profile_view = FeatureView(
    name="customer_daily_profile",
    entities=["customer"],
    ttl=timedelta(days=7),
    schema=[
        Field(name="current_balance", dtype=Float32),
        Field(name="avg_passenger_count", dtype=Float32),
        Field(name="lifetime_trip_count", dtype=Int64),
    ],