Example #1
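Disables Feast usage reporting via environment variables, applies an entity against a throwaway local repo, and asserts that no usage rows show up in BigQuery for the forced usage UUID (read_bigquery_usage_id is a helper defined alongside the test).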
def test_usage_off():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "False"
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id

    with tempfile.TemporaryDirectory() as temp_dir:
        test_feature_store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="fake_project",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online.db")
                ),
            )
        )
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )
        test_feature_store.apply([entity])

        # Restore the original environment, then give any (unexpected) usage
        # event time to land in BigQuery before asserting that none did.
        os.environ.clear()
        os.environ.update(old_environ)
        sleep(30)
        rows = read_bigquery_usage_id(test_usage_id)
        assert rows.total_rows == 0
Example #2
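Driver script for an AWS-backed repo: loads the feature store from the current directory, applies the driver entity and feature view (assumed to be imported from the repo's definitions module), materializes incrementally, and reads features back from the DynamoDB online store.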
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to AWS
    print("Deploying feature store to AWS...")
    fs.apply([driver, driver_hourly_stats_view])

    # Select features
    feature_refs = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"]

    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print("Retrieving online features...")

    # Retrieve features from the online store (DynamoDB)
    online_features = fs.get_online_features(
        feature_refs=feature_refs,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()

    print(pd.DataFrame.from_dict(online_features))
Example #3
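Flyte task that registers a horse-colic entity and feature view over a FileSource pointing at a FlyteSchema's remote Parquet path, then returns the updated FeatureStore.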
def store_offline(feature_store: FeatureStore,
                  dataframe: FlyteSchema) -> FeatureStore:
    horse_colic_entity = Entity(name="Hospital Number",
                                value_type=ValueType.STRING)

    horse_colic_feature_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=[
            Feature(name="rectal temperature", dtype=ValueType.FLOAT),
            Feature(name="total protein", dtype=ValueType.FLOAT),
            Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
            Feature(name="surgical lesion", dtype=ValueType.STRING),
            Feature(name="abdominal distension", dtype=ValueType.FLOAT),
            Feature(name="nasogastric tube", dtype=ValueType.STRING),
            Feature(name="outcome", dtype=ValueType.STRING),
            Feature(name="packed cell volume", dtype=ValueType.FLOAT),
            Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
        ],
        batch_source=FileSource(
            path=str(dataframe.remote_path),
            event_timestamp_column="timestamp",
        ),
        ttl=timedelta(days=1),
    )

    # Register the entity and feature view with Feast (apply records
    # definitions; it does not ingest the underlying data)
    feature_store.apply([horse_colic_entity, horse_colic_feature_view])

    return feature_store
Example #4
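The mirror image of Example #1: telemetry is forced on, and the test waits (with retries) for the forced telemetry UUID to appear in BigQuery.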
def test_telemetry_on():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    with tempfile.TemporaryDirectory() as temp_dir:
        test_feature_store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="fake_project",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online.db")
                ),
            )
        )
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        test_feature_store.apply([entity])

        os.environ.clear()
        os.environ.update(old_environ)
        ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Example #5
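Fixture that prepares a local repo for a FeatureStore docstring test: initializes a repo with init_repo, defines a driver entity and an hourly-stats feature view over a Parquet source, applies them, and materializes a recent time window.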
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    fs = FeatureStore(repo_path="feature_repo")
    driver = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    driver_hourly_stats = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    driver_hourly_stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=driver_hourly_stats,
    )
    fs.apply([driver_hourly_stats_view, driver])
    fs.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
Example #6
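Snowflake variant of Example #2: additionally builds an entity dataframe, retrieves historical features for training, then materializes and queries the online store.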
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to Snowflake
    print("Deploying feature store to Snowflake...")
    fs.apply([driver, driver_stats_fv])

    # Select features
    features = [
        "driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"
    ]

    # Create an entity dataframe. This is the dataframe that will be enriched with historical features
    entity_df = pd.DataFrame({
        "event_timestamp": [
            pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
            for dt in pd.date_range(
                start=datetime.now() - timedelta(days=3),
                end=datetime.now(),
                periods=3,
            )
        ],
        "driver_id": [1001, 1002, 1003],
    })

    print("Retrieving training data...")

    # Retrieve historical features by joining the entity dataframe to the Snowflake table source
    training_df = fs.get_historical_features(features=features,
                                             entity_df=entity_df).to_df()

    print()
    print(training_df)

    print()
    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Retrieve features from the online store
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{
            "driver_id": 1001
        }, {
            "driver_id": 1002
        }],
    ).to_dict()

    print()
    print(pd.DataFrame.from_dict(online_features))
Example #7
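Same telemetry-on test as Example #4, but constructed without a temporary repo or explicit RepoConfig; FeatureStore() here relies on whatever repo configuration the surrounding test environment supplies.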
def test_telemetry_on():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    test_feature_store = FeatureStore()
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    test_feature_store.apply([entity])

    os.environ.clear()
    os.environ.update(old_environ)
    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Example #8
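Telemetry-off counterpart of Example #7: with FEAST_TELEMETRY set to "False", the test asserts that no rows for the forced telemetry UUID reach BigQuery.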
def test_telemetry_off():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "False"
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id

    test_feature_store = FeatureStore()
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )
    test_feature_store.apply([entity])

    os.environ.clear()
    os.environ.update(old_environ)
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
Example #9
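Quickstart-style repo definitions: a driver entity and a year-TTL feature view over driver_hourly_stats (a batch source assumed to be defined earlier in the file), followed by an incremental materialization up to now.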
# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps, and
# three feature columns. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 365),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

fs = FeatureStore("")
fs.apply([driver_hourly_stats_view, driver])

now = datetime.now()
fs.materialize_incremental(now)
Example #10
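Benchmark setup that builds 25 feature views of 10 integer features each, groups them into a feature service, applies everything, and materializes. The snippet is truncated at the top, so generated_data_source, driver_hourly_stats_view, driver, entity, and start are assumed to be defined earlier in the file.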

benchmark_feature_views = [
    FeatureView(
        name=f"feature_view_{i}",
        entities=["entity"],
        ttl=Duration(seconds=86400),
        features=[
            Feature(name=f"feature_{10 * i + j}", dtype=ValueType.INT64)
            for j in range(10)
        ],
        online=True,
        batch_source=generated_data_source,
    ) for i in range(25)
]

benchmark_feature_service = FeatureService(
    name=f"benchmark_feature_service",
    features=benchmark_feature_views,
)

fs = FeatureStore(".")
fs.apply([
    driver_hourly_stats_view, driver, entity, benchmark_feature_service,
    *benchmark_feature_views
])

now = datetime.now()
fs.materialize(start, now)
print("Materialization finished")
Example #11
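Generator that constructs a disposable test Environment: it creates a dataset and data source via a dynamically imported DataSourceCreator, wires them into a RepoConfig, optionally applies entities and feature views and materializes, and always tears both the data source and the store down.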
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    This method should take in the parameters from the test repo config and created a feature repo, apply it,
    and return the constructed feature store object to callers.

    This feature store object can be interacted for the purposes of tests.
    The user is *not* expected to perform any clean up actions.

    :param test_repo_config: configuration
    :return: A feature store built using the supplied configuration.
    """
    df = create_dataset()

    project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"

    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1)

    offline_creator: DataSourceCreator = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator")(project)
    ds = offline_creator.create_data_source(project,
                                            df,
                                            field_mapping={
                                                "ts_1": "ts",
                                                "id": "driver_id"
                                            })
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        fvs = []
        entities = []
        try:
            if create_and_apply:
                entities.extend([driver(), customer()])
                fvs.extend([
                    environment.driver_stats_feature_view(),
                    environment.customer_feature_view(),
                ])
                fs.apply(fvs + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            offline_creator.teardown()
            fs.teardown()
Example #12
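Benchmark data generator that writes a Parquet file of random integer features, followed by the same apply-and-materialize sequence as Example #10 (the referenced view, entity, and service objects are assumed to be defined earlier in the file).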
def generate_data(num_rows: int, num_features: int,
                  destination: str) -> pd.DataFrame:
    """Generates a dataframe of random integer features keyed by a string
    entity column, writes it to Parquet, and returns it."""
    features = [f"feature_{i}" for i in range(num_features)]
    columns = ["entity", "event_timestamp"] + features
    df = pd.DataFrame(0, index=np.arange(num_rows), columns=columns)
    df["event_timestamp"] = datetime.utcnow()
    for column in features:
        df[column] = np.random.randint(1, num_rows, num_rows)

    # Entity keys look like "key-1", "key-2", ... using pandas' string dtype.
    df["entity"] = "key-" + pd.Series(np.arange(1, num_rows + 1)).astype(
        pd.StringDtype())

    df.to_parquet(destination)
    return df


generate_data(10**3, 250, "benchmark_data.parquet")

fs = FeatureStore(".")
fs.apply([
    driver_hourly_stats_view,
    transformed_conv_rate,
    driver,
    entity,
    benchmark_feature_service,
    *benchmark_feature_views,
])

now = datetime.now()
fs.materialize(start, now)
print("Materialization finished")