Ejemplo n.º 1
0
def test_partial() -> None:
    """
    Apply an extra table on top of an existing repo via the partial apply API,
    then verify that both the CLI-applied table and the new one pass the RW test.
    """

    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py"),
                           "bigquery") as store:

        locations_source = BigQuerySource(
            table="feast-oss.public.drivers",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        extra_view = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=timedelta(days=1),
            schema=[
                Field(name="lat", dtype=Float32),
                Field(name="lon", dtype=String),
                Field(name="name", dtype=String),
            ],
            online=True,
            batch_source=locations_source,
            tags={},
        )

        # Partial apply: only the new view is registered here.
        store.apply([extra_view])

        # Both the pre-existing view and the freshly applied one must round-trip.
        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
Ejemplo n.º 2
0
def create_item_embeddings_feature_view(source, infer_features: bool = False):
    """Build the "item_embeddings" FeatureView over *source*.

    When infer_features is True the schema is left as None so that Feast
    infers the fields from the source instead.
    """
    if infer_features:
        fields = None
    else:
        fields = [
            Field(name="embedding_double", dtype=Array(Float64)),
            Field(name="embedding_float", dtype=Array(Float32)),
        ]
    return FeatureView(
        name="item_embeddings",
        entities=["item"],
        schema=fields,
        batch_source=source,
        ttl=timedelta(hours=2),
    )
Ejemplo n.º 3
0
def create_customer_daily_profile_feature_view(source,
                                               infer_features: bool = False):
    """Build the "customer_profile" FeatureView over *source*.

    When infer_features is True the schema is None so Feast infers the fields.
    """
    if infer_features:
        fields = None
    else:
        fields = [
            Field(name="current_balance", dtype=Float32),
            Field(name="avg_passenger_count", dtype=Float32),
            Field(name="lifetime_trip_count", dtype=Int32),
        ]
    return FeatureView(
        name="customer_profile",
        entities=["customer_id"],
        schema=fields,
        source=source,
        ttl=timedelta(days=2),
    )
Ejemplo n.º 4
0
def create_driver_hourly_stats_feature_view(source,
                                            infer_features: bool = False):
    """Build the "driver_stats" FeatureView over *source*.

    When infer_features is True the schema is None so Feast infers the fields.
    """
    if infer_features:
        fields = None
    else:
        fields = [
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int32),
        ]
    return FeatureView(
        name="driver_stats",
        entities=["driver"],
        schema=fields,
        source=source,
        ttl=timedelta(hours=2),
    )
Ejemplo n.º 5
0
def conv_rate_plus_100_feature_view(
    sources: Dict[str, Union[RequestSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Field]] = None,
) -> OnDemandFeatureView:
    """Build the conv_rate_plus_100 ODFV over *sources*.

    Explicit *features* take precedence over the defaults; with
    infer_features=True an empty schema is passed so Feast infers it.
    Positional-argument Features are still exercised for ODFVs here.
    """
    default_fields = [
        Field(name="conv_rate_plus_100", dtype=Float64),
        Field(name="conv_rate_plus_val_to_add", dtype=Float64),
        Field(name="conv_rate_plus_100_rounded", dtype=Int32),
    ]
    schema = [] if infer_features else (features or default_fields)
    return OnDemandFeatureView(
        name=conv_rate_plus_100.__name__,
        schema=schema,
        sources=sources,
        udf=conv_rate_plus_100,
    )
Ejemplo n.º 6
0
def similarity_feature_view(
    sources: Dict[str, Union[RequestSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Feature]] = None,
) -> OnDemandFeatureView:
    """Build the similarity ODFV over *sources*.

    Caller-supplied *features* (legacy Feature objects) are converted to
    Fields; with infer_features=True an empty schema is passed instead.
    """
    if features is None:
        fields = [
            Field(name="cos_double", dtype=Float64),
            Field(name="cos_float", dtype=Float32),
        ]
    else:
        fields = [Field.from_feature(f) for f in features]

    return OnDemandFeatureView(
        name=similarity.__name__,
        sources=sources,
        schema=[] if infer_features else fields,
        udf=similarity,
    )
Ejemplo n.º 7
0
def create_order_feature_view(source, infer_features: bool = False):
    """Build the "order" FeatureView (joint driver/customer key) over *source*.

    When infer_features is True the schema is None so Feast infers the fields.
    """
    if infer_features:
        fields = None
    else:
        fields = [Field(name="order_is_success", dtype=Int32)]
    return FeatureView(
        name="order",
        entities=["driver", "customer_id"],
        schema=fields,
        source=source,
        ttl=timedelta(days=2),
    )
Ejemplo n.º 8
0
def create_location_stats_feature_view(source, infer_features: bool = False):
    """Build the "location_stats" FeatureView over *source*.

    When infer_features is True the schema is None so Feast infers the fields.
    """
    if infer_features:
        fields = None
    else:
        fields = [Field(name="temperature", dtype=Int32)]
    return FeatureView(
        name="location_stats",
        entities=[location()],
        schema=fields,
        source=source,
        ttl=timedelta(days=2),
    )
Ejemplo n.º 9
0
def driver_feature_view(
    data_source: DataSource,
    name="test_correctness",
    infer_features: bool = False,
    dtype: FeastType = Float32,
) -> FeatureView:
    """Build a single-field driver FeatureView over *data_source*.

    The one "value" field uses *dtype*; with infer_features=True the schema
    is None so Feast infers it.
    """
    if infer_features:
        fields = None
    else:
        fields = [Field(name="value", dtype=dtype)]
    return FeatureView(
        name=name,
        entities=["driver"],
        schema=fields,
        ttl=timedelta(days=5),
        source=data_source,
    )
Ejemplo n.º 10
0
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, FeatureStore, FeatureView, Field, FileSource, ValueType
    from feast.repo_operations import init_repo
    from feast.types import Float32, Int64

    # Scaffold a local repo on disk and point a store at it.
    init_repo("feature_repo", "local")
    store = FeatureStore(repo_path="feature_repo")

    driver_entity = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    stats_source = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )
    stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        schema=[
            Field(name="conv_rate", dtype=Float32),
            Field(name="acc_rate", dtype=Float32),
            Field(name="avg_daily_trips", dtype=Int64),
        ],
        batch_source=stats_source,
    )
    store.apply([stats_view, driver_entity])
    # Materialize the last ~3 hours so the online store has recent values.
    store.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
Ejemplo n.º 11
0
def test_infer_odfv_features_with_error(environment, universal_data_sources):
    """Applying an ODFV that declares a feature its UDF never produces must raise."""
    store = environment.feature_store
    entities, datasets, data_sources = universal_data_sources

    # Declare a feature name the conv_rate UDF does not emit.
    bogus_features = [Field(name="conv_rate_plus_200", dtype=Float64)]
    stats_fv = create_driver_hourly_stats_batch_feature_view(
        data_sources.driver)
    request_source = create_conv_rate_request_source()
    odfv = conv_rate_plus_100_feature_view(
        [stats_fv, request_source],
        features=bogus_features,
    )

    objects = [stats_fv, odfv, driver(), customer()]
    with pytest.raises(SpecifiedFeaturesNotPresentError):
        store.apply(objects)
Ejemplo n.º 12
0
    join_keys=["driver_id"],
    value_type=ValueType.INT64,
    description="driver id",
)

# Entity keyed on "customer_id", used for joining customer feature views.
customer = Entity(
    name="customer",  # The name is derived from this argument, not object name.
    join_keys=["customer_id"],
    value_type=ValueType.STRING,
)

# Driver GPS coordinates, served online with a 1-day TTL.
driver_locations = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    schema=[Field(name="lat", dtype=Float32),
            Field(name="lon", dtype=String)],  # NOTE(review): lon typed as String while lat is Float32 — confirm intended
    online=True,
    batch_source=driver_locations_source,
    tags={},
)

pushed_driver_locations = FeatureView(
    name="pushed_driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    schema=[
        Field(name="driver_lat", dtype=Float32),
        Field(name="driver_long", dtype=String),
    ],
    online=True,
Ejemplo n.º 13
0
# Feature views are a grouping based on how features are stored in either the
# online or offline store.
driver_stats_fv = FeatureView(
    # The unique name of this feature view. Two feature views in a single
    # project cannot have the same name
    name="driver_hourly_stats",
    # The list of entities specifies the keys required for joining or looking
    # up features from this feature view. The references provided in this field
    # correspond to the names of defined entities
    entities=["driver"],
    # The timedelta is the maximum age that each feature value may have
    # relative to its lookup time. For historical features (used in training),
    # TTL is relative to each timestamp provided in the entity dataframe.
    # TTL also allows for eviction of keys from online stores and limits the
    # amount of historical scanning required for historical feature values
    # during retrieval
    ttl=timedelta(weeks=52),
    # The fields below act as a schema: they define the features to be
    # materialized into a store, and serve as the references used when
    # building a training dataset or serving features during retrieval
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    # Batch sources are used to find feature values. In the case of this feature
    # view we will query a source table on Redshift for driver statistics
    # features
    batch_source=driver_stats_source,
)
Ejemplo n.º 14
0
def test_write_to_online_store_event_check(local_redis_environment):
    """Verify that direct online-store writes respect event timestamps:
    a newer event overwrites a stored value, an older one does not, and
    materialization from the batch source follows the same rule.
    """
    if os.getenv("FEAST_IS_LOCAL_TEST", "False") == "True":
        return
    fs = local_redis_environment.feature_store

    # Write the same data points three times with different timestamps.
    now = pd.Timestamp(datetime.datetime.utcnow()).round("ms")
    hour_ago = pd.Timestamp(datetime.datetime.utcnow() -
                            timedelta(hours=1)).round("ms")
    latest = pd.Timestamp(datetime.datetime.utcnow() +
                          timedelta(seconds=1)).round("ms")

    # Batch-source rows: id=123 carries an hour-old value, 567/890 are current.
    data = {
        "id": [123, 567, 890],
        "string_col": ["OLD_FEATURE", "LATEST_VALUE2", "LATEST_VALUE3"],
        "ts_1": [hour_ago, now, now],
    }
    dataframe_source = pd.DataFrame(data)
    with prep_file_source(df=dataframe_source,
                          event_timestamp_column="ts_1") as file_source:
        e = Entity(name="id", value_type=ValueType.STRING)

        # Create Feature View
        fv1 = FeatureView(
            name="feature_view_123",
            schema=[Field(name="string_col", dtype=String)],
            entities=["id"],
            batch_source=file_source,
            ttl=timedelta(minutes=5),
        )
        # Register Feature View and Entity
        fs.apply([fv1, e])

        # data to ingest into Online Store (recent)
        data = {
            "id": [123],
            "string_col": ["hi_123"],
            "ts_1": [now],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(features=["feature_view_123:string_col"],
                                    entity_rows=[{
                                        "id": 123
                                    }]).to_df()
        assert df["string_col"].iloc[0] == "hi_123"

        # data to ingest into Online Store (1 hour delayed data);
        # must NOT overwrite the feature for id=123 because it is less
        # recent than the value already stored (see assertion below)
        data = {
            "id": [123, 567, 890],
            "string_col": ["bye_321", "hello_123", "greetings_321"],
            "ts_1": [hour_ago, hour_ago, hour_ago],
        }
        df_data = pd.DataFrame(data)

        # directly ingest data into the Online Store
        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{
                "id": 123
            }, {
                "id": 567
            }, {
                "id": 890
            }],
        ).to_df()
        # id=123 keeps its newer value; 567/890 had no value, so the stale
        # rows are accepted for them.
        assert df["string_col"].iloc[0] == "hi_123"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # should overwrite string_col for id=123 because it's most recent based on event_timestamp
        data = {
            "id": [123],
            "string_col": ["LATEST_VALUE"],
            "ts_1": [latest],
        }
        df_data = pd.DataFrame(data)

        fs.write_to_online_store("feature_view_123", df_data)

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{
                "id": 123
            }, {
                "id": 567
            }, {
                "id": 890
            }],
        ).to_df()
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "hello_123"
        assert df["string_col"].iloc[2] == "greetings_321"

        # writes to online store via datasource (dataframe_source) materialization
        # NOTE(review): mixes naive local time (now()) with utcnow() — likely
        # intended to be utcnow() for both; confirm before changing.
        fs.materialize(
            start_date=datetime.datetime.now() - timedelta(hours=12),
            end_date=datetime.datetime.utcnow(),
        )

        df = fs.get_online_features(
            features=["feature_view_123:string_col"],
            entity_rows=[{
                "id": 123
            }, {
                "id": 567
            }, {
                "id": 890
            }],
        ).to_df()
        # Materialization replaces 123 only where the source row is newer,
        # and fills 567/890 with their current batch values.
        assert df["string_col"].iloc[0] == "LATEST_VALUE"
        assert df["string_col"].iloc[1] == "LATEST_VALUE2"
        assert df["string_col"].iloc[2] == "LATEST_VALUE3"
Ejemplo n.º 15
0
)
# Spark-readable parquet source for daily customer profile snapshots.
customer_daily_profile = SparkSource(
    name="customer_daily_profile",
    path=f"{CURRENT_DIR}/data/customer_daily_profile.parquet",
    file_format="parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# Feature Views
# Hourly driver statistics, served online with a 7-day TTL.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver"],
    ttl=timedelta(days=7),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    source=driver_hourly_stats,
    tags={},
)
customer_daily_profile_view = FeatureView(
    name="customer_daily_profile",
    entities=["customer"],
    ttl=timedelta(days=7),
    schema=[
        Field(name="current_balance", dtype=Float32),
        Field(name="avg_passenger_count", dtype=Float32),
        Field(name="lifetime_trip_count", dtype=Int64),
Ejemplo n.º 16
0
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# Entity keyed on "driver_id", referenced by the view below.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Hourly driver statistics, served online with a 1-day TTL.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=timedelta(days=1),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

# Parquet source for global daily stats; the path is injected by the test.
global_daily_stats = FileSource(
    path="%PARQUET_PATH_GLOBAL%",  # placeholder to be replaced by the test
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

global_stats_feature_view = FeatureView(
Ejemplo n.º 17
0
def create_conv_rate_request_source():
    """Request source exposing the "val_to_add" input consumed by conv-rate ODFVs."""
    request_fields = [Field(name="val_to_add", dtype=Int32)]
    return RequestSource(name="conv_rate_input", schema=request_fields)
Ejemplo n.º 18
0
    labels={"owner": "*****@*****.**", "team": "hack week",},
)

# File-backed source of per-zipcode demographic data.
zipcode_source = FileSource(
    name="zipcode",
    path="data/zipcode_table.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Per-zipcode demographic features, effectively non-expiring (10-year TTL),
# served online.
zipcode_features = FeatureView(
    name="zipcode_features",
    entities=["zipcode"],
    ttl=timedelta(days=3650),
    schema=[
        Field(name="city", dtype=String),
        Field(name="state", dtype=String),
        Field(name="location_type", dtype=String),
        Field(name="tax_returns_filed", dtype=Int64),
        Field(name="population", dtype=Int64),
        Field(name="total_wages", dtype=Int64),
    ],
    batch_source=zipcode_source,
    tags={
        "date_added": "2022-02-7",
        "experiments": "experiment-A,experiment-B,experiment-C",
        "access_group": "*****@*****.**",
    },
    online=True,
)