Ejemplo n.º 1
0
def build_feature_store(s3_bucket: str, registry_path: str,
                        online_store_path: str) -> FeatureStore:
    """Create a ``FeatureStore`` for the ``"horsecolic"`` project.

    Args:
        s3_bucket: Passed to ``FeatureStoreConfig`` as ``s3_bucket``.
        registry_path: Passed to ``FeatureStoreConfig`` as ``registry_path``.
        online_store_path: Passed to ``FeatureStoreConfig`` as
            ``online_store_path``.

    Returns:
        A ``FeatureStore`` built from the assembled configuration.
    """
    settings = {
        "project": "horsecolic",
        "s3_bucket": s3_bucket,
        "registry_path": registry_path,
        "online_store_path": online_store_path,
    }
    return FeatureStore(config=FeatureStoreConfig(**settings))
Ejemplo n.º 2
0
def main():
    """Walk through apply, historical retrieval, materialization and online reads.

    Everything runs against the feature repository in the current working
    directory and the results are printed to stdout.
    """
    # Show every column and use a wide console so printed frames are not cut off.
    for option, value in (("display.max_columns", None), ("display.width", 1000)):
        pd.set_option(option, value)

    # Open the feature repository located in the current directory.
    store = FeatureStore(repo_path=".")

    # Register the entity and feature view definitions.
    print("Deploying feature store to GCP...")
    store.apply([driver, driver_stats_fv])

    wanted_features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"]

    # Entity dataframe: three drivers, each paired with one of three evenly
    # spaced timestamps spanning the last three days. This frame is what the
    # historical retrieval enriches with feature values.
    window_start = datetime.now() - timedelta(days=3)
    window_end = datetime.now()
    event_times = []
    for moment in pd.date_range(start=window_start, end=window_end, periods=3):
        event_times.append(pd.Timestamp(moment, unit="ms", tz="UTC").round("ms"))
    entities = pd.DataFrame(
        {"event_timestamp": event_times, "driver_id": [1001, 1002, 1003]}
    )

    print("Retrieving training data...")

    # Join the entity dataframe against the feature view's batch source.
    history_job = store.get_historical_features(
        feature_refs=wanted_features, entity_df=entities
    )
    training_frame = history_job.to_df()

    print()
    print(training_frame)

    print()
    print("Loading features into the online store...")
    store.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Read the same features back from the online store for two drivers.
    lookup_rows = [{"driver_id": 1001}, {"driver_id": 1002}]
    online_response = store.get_online_features(
        feature_refs=wanted_features, entity_rows=lookup_rows
    )
    online_values = online_response.to_dict()

    print()
    print(pd.DataFrame.from_dict(online_values))
Ejemplo n.º 3
0
def generate_saved_dataset():
    """Save historical features for ``credit_score_v1`` as ``my_training_ds``.

    Reads the entity dataframe from ``data/loan_table.parquet``, builds a
    historical retrieval job for the feature service and registers the result
    as a saved dataset stored at ``my_training_ds.parquet``.
    """
    store = FeatureStore(repo_path=".")
    loans = pd.read_parquet(path="data/loan_table.parquet")

    service = store.get_feature_service("credit_score_v1")
    retrieval_job = store.get_historical_features(entity_df=loans, features=service)
    target_storage = SavedDatasetFileStorage(path="my_training_ds.parquet")

    store.create_saved_dataset(
        from_=retrieval_job,
        name="my_training_ds",
        storage=target_storage,
        feature_service=service,
        profiler=credit_profiler,
    )
Ejemplo n.º 4
0
def test_exception_usage_on():
    """Check that a failing ``FeatureStore`` construction is logged when usage is on.

    The test forces a unique usage id through environment variables, triggers
    a failure by pointing ``FeatureStore`` at a non-existent directory, and
    then hands the id to ``ensure_bigquery_usage_id_with_retry`` (presumably a
    helper that polls BigQuery until the id shows up -- confirm against its
    definition).

    The original environment is always restored, even if the body is
    interrupted, so the usage flags cannot leak into other tests.
    """
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())

    try:
        os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id
        os.environ["FEAST_IS_USAGE_TEST"] = "True"
        os.environ["FEAST_USAGE"] = "True"

        FeatureStore("/tmp/non_existent_directory")
    except Exception:
        # The construction is expected to fail; only the usage event it emits
        # matters. KeyboardInterrupt/SystemExit are deliberately not swallowed.
        pass
    finally:
        os.environ.clear()
        os.environ.update(old_environ)

    ensure_bigquery_usage_id_with_retry(test_usage_id)
Ejemplo n.º 5
0
def run_demo():
    """Print the saved training dataset and one online feature lookup.

    The online lookup uses the ``credit_score_v3`` feature service with a
    single hard-coded entity row; results are printed sorted by key.
    """
    store = FeatureStore(repo_path=".")

    print("--- Historical features (from saved dataset) ---")
    print(store.get_saved_dataset("my_training_ds").to_df())

    print("\n--- Online features ---")
    service = store.get_feature_service("credit_score_v3")
    lookup_row = {"zipcode": 30721, "dob_ssn": "19530219_5179", "transaction_amt": 1023}
    response = store.get_online_features(features=service, entity_rows=[lookup_row])
    online_values = response.to_dict()

    # Dictionary keys are unique, so ordering by key matches ordering the items.
    for name in sorted(online_values):
        print(name, " : ", online_values[name])
Ejemplo n.º 6
0
def test_exception_usage_off():
    """Check that no usage row is recorded for a failure when usage is off.

    With ``FEAST_USAGE`` set to ``"False"`` the test triggers a failing
    ``FeatureStore`` construction, waits 30 seconds, and asserts that
    ``read_bigquery_usage_id`` returns zero rows for the forced usage id.

    The original environment is always restored, even if the body is
    interrupted, so the usage flags cannot leak into other tests.
    """
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())

    try:
        os.environ["FEAST_IS_USAGE_TEST"] = "True"
        os.environ["FEAST_USAGE"] = "False"
        os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id

        FeatureStore("/tmp/non_existent_directory")
    except Exception:
        # The construction is expected to fail; the test only cares that the
        # failure is NOT reported. KeyboardInterrupt/SystemExit still propagate.
        pass
    finally:
        os.environ.clear()
        os.environ.update(old_environ)

    # Give any (unexpected) usage event time to arrive before checking.
    sleep(30)
    rows = read_bigquery_usage_id(test_usage_id)
    assert rows.total_rows == 0
Ejemplo n.º 7
0
def get_historical_features():
    """Fetch historical ``project_details`` features and print a preview.

    The entity dataframe pairs three hard-coded project ids with today's
    date at midnight; the first rows of the result are printed.
    """
    # Entities to pull data for (should eventually be read from somewhere dynamic).
    entity_ids = [1, 2, 3]
    today = datetime.now()
    midnight = datetime(today.year, today.month, today.day)
    entity_frame = pd.DataFrame.from_dict(
        {"id": entity_ids, "event_timestamp": [midnight for _ in entity_ids]}
    )

    # Point-in-time join against the feature repository under BASE_DIR/features.
    repo_dir = Path(config.BASE_DIR, "features")
    store = FeatureStore(repo_path=repo_dir)
    retrieval_job = store.get_historical_features(
        entity_df=entity_frame,
        feature_refs=["project_details:text", "project_details:tags"],
    )
    features_frame = retrieval_job.to_df()

    # Only a preview is printed here; nothing is persisted for a training task.
    print(features_frame.head())
Ejemplo n.º 8
0
def test_telemetry_on():
    """Check that applying an entity emits telemetry when telemetry is on.

    A unique telemetry id is forced through environment variables, one entity
    is applied to a default ``FeatureStore``, and the id is then handed to
    ``ensure_bigquery_telemetry_id_with_retry`` (presumably a helper that
    polls BigQuery for the id -- confirm against its definition).

    The original environment is restored in a ``finally`` block so a failure
    in ``FeatureStore()`` or ``apply`` cannot leak the telemetry flags into
    other tests.
    """
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())

    try:
        os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
        os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
        os.environ["FEAST_TELEMETRY"] = "True"

        test_feature_store = FeatureStore()
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        test_feature_store.apply([entity])
    finally:
        os.environ.clear()
        os.environ.update(old_environ)

    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Ejemplo n.º 9
0
def test_telemetry_off():
    """Check that applying an entity emits no telemetry when telemetry is off.

    With ``FEAST_TELEMETRY`` set to ``"False"`` one entity is applied to a
    default ``FeatureStore``; after a 30 second wait the test asserts that
    ``read_bigquery_telemetry_id`` returns zero rows for the forced id.

    The original environment is restored in a ``finally`` block so a failure
    in ``FeatureStore()`` or ``apply`` cannot leak the telemetry flags into
    other tests.
    """
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())

    try:
        os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
        os.environ["FEAST_TELEMETRY"] = "False"
        os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id

        test_feature_store = FeatureStore()
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )
        test_feature_store.apply([entity])
    finally:
        os.environ.clear()
        os.environ.update(old_environ)

    # Give any (unexpected) telemetry event time to arrive before checking.
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
Ejemplo n.º 10
0
def setup_feature_store():
    """Create, populate and materialize a local repo for FeatureStore docstring tests.

    Initializes a ``local`` repository in ``feature_repo``, registers the
    ``driver_id`` entity and the ``driver_hourly_stats`` feature view, then
    materializes the window from three hours ago up to ten minutes ago (UTC).
    """
    from datetime import datetime, timedelta

    from feast import Entity, FeatureStore, FeatureView, Field, FileSource, ValueType
    from feast.repo_operations import init_repo
    from feast.types import Float32, Int64

    repo_dir = "feature_repo"
    init_repo(repo_dir, "local")
    store = FeatureStore(repo_path=repo_dir)

    driver_entity = Entity(
        name="driver_id", value_type=ValueType.INT64, description="driver id"
    )
    stats_source = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    # Column name -> dtype for the feature view schema, in declaration order.
    schema_spec = (
        ("conv_rate", Float32),
        ("acc_rate", Float32),
        ("avg_daily_trips", Int64),
    )
    stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(days=1),
        schema=[Field(name=column, dtype=dtype) for column, dtype in schema_spec],
        batch_source=stats_source,
    )
    store.apply([stats_view, driver_entity])

    window_start = datetime.utcnow() - timedelta(hours=3)
    window_end = datetime.utcnow() - timedelta(minutes=10)
    store.materialize(start_date=window_start, end_date=window_end)
Ejemplo n.º 11
0
def construct_test_environment(
    test_repo_config: IntegrationTestRepoConfig,
    test_suite_name: str = "integration_test",
) -> Environment:
    """Yield a throwaway test ``Environment`` backed by a temporary repo directory.

    A uniquely named project is created from ``test_suite_name`` plus an
    8-character random suffix. The feature store is torn down once the
    consumer of the generator is done with the yielded environment.

    Args:
        test_repo_config: Supplies the provider, the online store config and
            the factory for the offline data source creator.
        test_suite_name: Prefix of the generated project name.

    Yields:
        The ``Environment`` wrapping the freshly created feature store.
    """
    unique_suffix = str(uuid.uuid4()).replace("-", "")[:8]
    project = f"{test_suite_name}_{unique_suffix}"

    offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(
        project)
    offline_cfg = offline_creator.create_offline_store_config()
    online_cfg = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        registry_file = Path(repo_dir_name) / "registry.db"
        repo_config = RepoConfig(
            registry=str(registry_file),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_cfg,
            online_store=online_cfg,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=repo_config)

        # Initialize the registry up front: if a test applies nothing before
        # teardown, tearing down an uninitialized registry would blow up.
        fs.registry._initialize_registry()

        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source_creator=offline_creator,
        )

        try:
            yield environment
        finally:
            fs.teardown()
Ejemplo n.º 12
0
from datetime import datetime, timedelta

import pandas as pd
from feast import FeatureStore
from joblib import dump
from sklearn.linear_model import LinearRegression

import helpers

# Load driver order data from a tab-separated file and parse the event
# timestamps, which serve as the point-in-time column of the entity dataframe.
orders = pd.read_csv("driver_orders.csv", sep="\t")
orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"])

# Set up the feature store from the repository in driver_ranking/
fs = FeatureStore(repo_path="driver_ranking/")

# Retrieve training data: enrich every order row with the three
# driver_hourly_stats features (the offline source is presumably BigQuery --
# confirm against the repository configuration).
training_df = fs.get_historical_features(
    entity_df=orders,
    feature_refs=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

# Debugging aid (disabled): print the retrieved training frame.
# print(training_df)

# Train model: name of the label column to predict
target = "trip_completed"
Ejemplo n.º 13
0
def test_ge():
    """Print the profile stored on the ``my_training_ds`` saved dataset."""
    feature_store = FeatureStore(repo_path=".")

    print("--- Historical features (from saved dataset) ---")
    print(feature_store.get_saved_dataset("my_training_ds")._profile)
Ejemplo n.º 14
0
def test_online() -> None:
    """
    Test reading from the online store in local mode.

    The test writes one row into each of three feature views, reads the
    values back through ``get_online_features``, and then exercises registry
    caching: once with a one-second cache TTL and once with
    ``cache_ttl_seconds=0``, which this test treats as a cache that never
    expires. The registry file is temporarily renamed to prove whether a
    reload from disk is attempted.
    """
    runner = CliRunner()
    with runner.local_repo(
            get_example_repo("example_feature_repo_1.py")) as store:
        # Write one row into each of three feature views

        driver_locations_fv = store.get_feature_view(name="driver_locations")
        customer_profile_fv = store.get_feature_view(name="customer_profile")
        customer_driver_combined_fv = store.get_feature_view(
            name="customer_driver_combined")

        provider = store._get_provider()

        # Row for driver 1. Note that "lon" is written as a string value, which
        # is why the assertions below compare against "1.0".
        driver_key = EntityKeyProto(join_keys=["driver"],
                                    entity_values=[ValueProto(int64_val=1)])
        provider.online_write_batch(
            project=store.config.project,
            table=driver_locations_fv,
            data=[(
                driver_key,
                {
                    "lat": ValueProto(double_val=0.1),
                    "lon": ValueProto(string_val="1.0"),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Row for customer 5.
        customer_key = EntityKeyProto(join_keys=["customer"],
                                      entity_values=[ValueProto(int64_val=5)])
        provider.online_write_batch(
            project=store.config.project,
            table=customer_profile_fv,
            data=[(
                customer_key,
                {
                    "avg_orders_day": ValueProto(float_val=1.0),
                    "name": ValueProto(string_val="John"),
                    "age": ValueProto(int64_val=3),
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Row for the composite key (customer 5, driver 1); `customer_key` is
        # rebound to the two-part key here.
        customer_key = EntityKeyProto(
            join_keys=["customer", "driver"],
            entity_values=[ValueProto(int64_val=5),
                           ValueProto(int64_val=1)],
        )
        provider.online_write_batch(
            project=store.config.project,
            table=customer_driver_combined_fv,
            data=[(
                customer_key,
                {
                    "trips": ValueProto(int64_val=7)
                },
                datetime.utcnow(),
                datetime.utcnow(),
            )],
            progress=None,
        )

        # Retrieve four features for two identical entity rows
        # (driver=1, customer=5), both of which exist in the online store
        result = store.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }, {
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()

        # Result keys use the "<feature view>__<feature>" form.
        assert "driver_locations__lon" in result
        assert "customer_profile__avg_orders_day" in result
        assert "customer_profile__name" in result
        assert result["driver"] == [1, 1]
        assert result["customer"] == [5, 5]
        assert result["driver_locations__lon"] == ["1.0", "1.0"]
        assert result["customer_profile__avg_orders_day"] == [1.0, 1.0]
        assert result["customer_profile__name"] == ["John", "John"]
        assert result["customer_driver_combined__trips"] == [7, 7]

        # Ensure features are still in result when keys not found
        result = store.get_online_features(
            feature_refs=["customer_driver_combined:trips"],
            entity_rows=[{
                "driver": 0,
                "customer": 0
            }],
        ).to_dict()

        assert "customer_driver_combined__trips" in result

        # A reference to a feature view that does not exist must raise
        with pytest.raises(FeatureViewNotFoundException):
            store.get_online_features(
                feature_refs=["driver_locations_bad:lon"],
                entity_rows=[{
                    "driver": 1
                }],
            )

        # Create new FeatureStore object with fast cache invalidation
        # (the registry cache expires after `cache_ttl` seconds)
        cache_ttl = 1
        fs_fast_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=cache_ttl),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # Should load the registry and cache it until the TTL above expires
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Rename the registry.db so that it can't be used for refreshes
        os.rename(store.config.registry, store.config.registry + "_fake")

        # Wait for registry to expire
        time.sleep(cache_ttl)

        # Will try to reload the registry because the cache has expired; this
        # fails because the registry file was renamed away above
        with pytest.raises(FileNotFoundError):
            fs_fast_ttl.get_online_features(
                feature_refs=[
                    "driver_locations:lon",
                    "customer_profile:avg_orders_day",
                    "customer_profile:name",
                    "customer_driver_combined:trips",
                ],
                entity_rows=[{
                    "driver": 1,
                    "customer": 5
                }],
            ).to_dict()

        # Restore registry.db so that we can see if it actually reloads registry
        os.rename(store.config.registry + "_fake", store.config.registry)

        # Test if registry is actually reloaded and whether results return
        result = fs_fast_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Create a registry with infinite cache (for users that want to manually
        # refresh the registry); cache_ttl_seconds=0 is used to request this
        fs_infinite_ttl = FeatureStore(config=RepoConfig(
            registry=RegistryConfig(path=store.config.registry,
                                    cache_ttl_seconds=0),
            online_store=store.config.online_store,
            project=store.config.project,
            provider=store.config.provider,
        ))

        # Should return results (and fill the registry cache)
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Wait a bit so that an arbitrary (finite) TTL would have expired by now
        time.sleep(2)

        # Rename the registry.db so that it can't be used for refreshes
        os.rename(store.config.registry, store.config.registry + "_fake")

        # TTL is infinite so this call should be served from the registry cache
        # even though the registry file is currently missing
        result = fs_infinite_ttl.get_online_features(
            feature_refs=[
                "driver_locations:lon",
                "customer_profile:avg_orders_day",
                "customer_profile:name",
                "customer_driver_combined:trips",
            ],
            entity_rows=[{
                "driver": 1,
                "customer": 5
            }],
        ).to_dict()
        assert result["driver_locations__lon"] == ["1.0"]
        assert result["customer_driver_combined__trips"] == [7]

        # Force registry reload (should fail because file is missing)
        with pytest.raises(FileNotFoundError):
            fs_infinite_ttl.refresh_registry()

        # Restore registry.db so that teardown works
        os.rename(store.config.registry + "_fake", store.config.registry)
Ejemplo n.º 15
0
# Parse the repository configuration. The "registry" entry may be either a
# bare path string or a mapping that carries the path under "path".
raw_config = yaml.safe_load(config_string)
registry = raw_config["registry"]
registry_path = registry["path"] if isinstance(registry, dict) else registry
registry_store_class = get_registry_store_class_from_scheme(registry_path)
# For a local registry store whose file does not exist yet, recreate the file
# from the base64-encoded contents of the REGISTRY_ENV_NAME environment
# variable (a missing variable raises KeyError).
# NOTE(review): the existence check uses `registry_path` as given, while the
# file is written under `repo_path / registry_path` -- confirm both resolve to
# the same location for relative paths.
if registry_store_class == LocalRegistryStore and not os.path.exists(
        registry_path):
    registry_base64 = os.environ[REGISTRY_ENV_NAME]
    registry_bytes = base64.b64decode(registry_base64)
    registry_dir = os.path.dirname(registry_path)
    if not os.path.exists(repo_path / registry_dir):
        os.makedirs(repo_path / registry_dir)
    with open(repo_path / registry_path, "wb") as f:
        f.write(registry_bytes)

# Initialize the feature store
store = FeatureStore(repo_path=str(repo_path.resolve()))

# When the configuration requests a positive cache TTL, replace the
# synchronous, TTL-driven refresh with a timer-driven background refresh.
if isinstance(registry, dict) and registry.get("cache_ttl_seconds", 0) > 0:
    # disable synchronous refresh
    store.config.registry.cache_ttl_seconds = 0

    # enable asynchronous refresh
    def async_refresh():
        # Refresh now, then re-arm a new timer so the refresh repeats every
        # `cache_ttl_seconds` (the value read from the raw configuration).
        store.refresh_registry()
        threading.Timer(registry["cache_ttl_seconds"], async_refresh).start()

    async_refresh()

# Start the feature transformation server
# The port comes from the environment when set to a non-empty value,
# otherwise the default port is used.
port = (os.environ.get(FEATURE_TRANSFORMATION_SERVER_PORT_ENV_NAME)
        or DEFAULT_FEATURE_TRANSFORMATION_SERVER_PORT)
Ejemplo n.º 16
0
    def __init__(self):
        """Load the serialized model and open the ``driver_ranking/`` feature repo."""
        model_file = "driver_model.bin"
        feature_repo = "driver_ranking/"

        # The model is loaded first, then the feature store is opened.
        self.model = load(model_file)
        self.fs = FeatureStore(repo_path=feature_repo)
Ejemplo n.º 17
0
def construct_test_environment(
    test_repo_config: IntegrationTestRepoConfig,
    test_suite_name: str = "integration_test",
    worker_id: str = "worker_id",
    offline_container: Optional[DockerContainer] = None,
) -> Environment:
    """Build an integration-test ``Environment`` around a fresh temporary repo.

    The project name combines ``test_suite_name``, a run label (a random
    6-character id, prefixed with the GitHub run id when ``GITHUB_RUN_ID`` is
    set) and ``GITHUB_RUN_NUMBER`` (default ``1``). A ``feature_store.yaml``
    is written into a new temporary directory and the feature store is loaded
    from it. The temporary directory is not removed by this function.

    Args:
        test_repo_config: Supplies provider, store creators and server flags.
            Its ``online_store`` attribute is overwritten when an online
            store creator is configured.
        test_suite_name: Prefix of the generated project name.
        worker_id: Stored on the returned environment.
        offline_container: Forwarded to the offline store creator.

    Returns:
        The constructed ``Environment``.
    """
    short_id = str(uuid.uuid4()).replace("-", "")[:6]

    github_run = os.getenv("GITHUB_RUN_ID", default=None)
    if github_run:
        run_label = f"gh_run_{github_run}_{short_id}"
    else:
        run_label = short_id
    run_number = os.getenv("GITHUB_RUN_NUMBER", default=1)

    project = f"{test_suite_name}_{run_label}_{run_number}"

    offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(
        project, offline_container=offline_container)
    offline_store_config = offline_creator.create_offline_store_config()

    online_creator = None
    if test_repo_config.online_store_creator:
        online_creator = test_repo_config.online_store_creator(project)
        online_store = online_creator.create_online_store()
        # The generated config is also written back onto the repo config.
        test_repo_config.online_store = online_store
    else:
        online_store = test_repo_config.online_store

    repo_dir_name = tempfile.mkdtemp()

    wants_aws_lambda_server = (test_repo_config.python_feature_server
                               and test_repo_config.provider == "aws")
    if wants_aws_lambda_server:
        from feast.infra.feature_servers.aws_lambda.config import (
            AwsLambdaFeatureServerConfig, )

        feature_server = AwsLambdaFeatureServerConfig(
            enabled=True,
            execution_role_name=
            "arn:aws:iam::402087665549:role/lambda_execution_role",
        )

        # The registry lives in S3 in this case rather than in the temp dir.
        registry = (
            f"s3://feast-integration-tests/registries/{project}/registry.db"
        )  # type: Union[str, RegistryConfig]
    else:
        # Note: even for a local feature server, the repo config carries no
        # feature server section.
        feature_server = None
        registry_file = Path(repo_dir_name) / "registry.db"
        registry = RegistryConfig(path=str(registry_file), cache_ttl_seconds=1)

    repo_config = RepoConfig(
        registry=registry,
        project=project,
        provider=test_repo_config.provider,
        offline_store=offline_store_config,
        online_store=online_store,
        repo_path=repo_dir_name,
        feature_server=feature_server,
        go_feature_retrieval=test_repo_config.go_feature_retrieval,
    )

    # Materialize the config as feature_store.yaml so the store loads from disk.
    yaml_path = Path(repo_dir_name) / "feature_store.yaml"
    with open(yaml_path, "w") as f:
        yaml.safe_dump(json.loads(repo_config.json()), f)

    fs = FeatureStore(repo_dir_name)
    # Initialize the registry up front: if a test applies nothing before
    # teardown, tearing down an uninitialized registry would blow up.
    fs.registry._initialize_registry()

    return Environment(
        name=project,
        test_repo_config=test_repo_config,
        feature_store=fs,
        data_source_creator=offline_creator,
        python_feature_server=test_repo_config.python_feature_server,
        worker_id=worker_id,
        online_store_creator=online_creator,
    )
Ejemplo n.º 18
0
)

# 25 feature views (feature_view_0 .. feature_view_24), each holding 10 INT64
# features. The features are numbered consecutively across views, so the
# names run from feature_0 to feature_249. All views share the same entity,
# batch source and a TTL of 86400 seconds (one day).
benchmark_feature_views = [
    FeatureView(
        name=f"feature_view_{i}",
        entities=["entity"],
        ttl=Duration(seconds=86400),
        features=[
            Feature(name=f"feature_{10 * i + j}", dtype=ValueType.INT64)
            for j in range(10)
        ],
        online=True,
        batch_source=generated_data_source,
    ) for i in range(25)
]

# A single feature service that bundles all benchmark feature views.
benchmark_feature_service = FeatureService(
    name=f"benchmark_feature_service",
    features=benchmark_feature_views,
)

# Register everything against the feature repository in the current directory.
fs = FeatureStore(".")
fs.apply([
    driver_hourly_stats_view, driver, entity, benchmark_feature_service,
    *benchmark_feature_views
])

# Materialize from `start` (defined earlier in the file, outside this excerpt)
# up to the current local time.
now = datetime.now()
fs.materialize(start, now)
print("Materialization finished")
Ejemplo n.º 19
0
import time
from feast import FeatureStore, ValueType
import pandas as pd
from datetime import datetime

store = FeatureStore(repo_path="feast_repo")

# Time the entity-frame construction together with the historical retrieval.
started_at = time.time()

# Eight users, all looked up at the same event timestamp.
user_ids = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
entity_df = pd.DataFrame.from_dict({
    "user_id": user_ids,
    "event_timestamp": [datetime(2021, 4, 21, 17, 58, 9) for _ in user_ids],
})

retrieval_job = store.get_historical_features(
    entity_df=entity_df,
    feature_refs=[
        "driver_hourly_stats:daily_transactions",
        "driver_hourly_stats:total_transactions",
    ],
)
training_df = retrieval_job.to_df()
print(training_df)

elapsed_seconds = time.time() - started_at
print("Execution time in seconds: " + str(elapsed_seconds))
Ejemplo n.º 20
0
def start_test_local_server(repo_path: str, port: int):
    """Serve the feature repository at ``repo_path`` on ``localhost``.

    Args:
        repo_path: Path of the feature repository to load.
        port: Port passed to ``FeatureStore.serve``.
    """
    # Access logging is switched off for the test server.
    FeatureStore(repo_path).serve("localhost", port, no_access_log=True)
Ejemplo n.º 21
0
def test_universal_cli(test_repo_config) -> None:
    """Drive the CLI end to end against a temporary feature repository.

    Covers ``apply``, the ``list`` and ``describe`` sub-commands for entities,
    feature views and feature services (both existing and missing objects), a
    second no-op ``apply``, a basic read/write check, and ``teardown``. The
    registry contents must be unchanged by the second ``apply``.
    """
    unique_suffix = str(uuid.uuid4()).replace("-", "")[:8]
    project = f"test_universal_cli_{unique_suffix}"

    runner = CliRunner()
    object_kinds = ("entities", "feature-views", "feature-services")

    with tempfile.TemporaryDirectory() as repo_dir_name:
        feature_store_yaml = make_feature_store_yaml(project, test_repo_config,
                                                     repo_dir_name)
        repo_path = Path(repo_dir_name)

        (repo_path / "feature_store.yaml").write_text(
            dedent(feature_store_yaml))
        (repo_path / "example.py").write_text(
            get_example_repo("example_feature_repo_1.py"))

        def expect_exit_code(cli_args, expected_code):
            # Run one CLI command inside the repo and check its return code.
            outcome = runner.run(cli_args, cwd=repo_path)
            assertpy.assert_that(outcome.returncode).is_equal_to(expected_code)

        expect_exit_code(["apply"], 0)

        # Store registry contents, to be compared later.
        fs = FeatureStore(repo_path=str(repo_path))
        registry_before = fs.registry.to_dict(project=project)

        # List commands should succeed for every object kind.
        for kind in object_kinds:
            expect_exit_code([kind, "list"], 0)

        # Describe commands should succeed when the objects exist.
        existing_objects = (
            ("entities", "driver"),
            ("feature-views", "driver_locations"),
            ("feature-services", "driver_locations_service"),
        )
        for kind, object_name in existing_objects:
            expect_exit_code([kind, "describe", object_name], 0)
        assertpy.assert_that(fs.list_feature_views()).is_length(3)

        # Describe commands should fail when the objects don't exist.
        for kind in object_kinds:
            expect_exit_code([kind, "describe", "foo"], 1)

        # Doing another apply should be a no op, and should not cause errors.
        expect_exit_code(["apply"], 0)
        basic_rw_test(
            FeatureStore(repo_path=str(repo_path), config=None),
            view_name="driver_locations",
        )

        # Confirm that registry contents have not changed.
        assertpy.assert_that(registry_before).is_equal_to(
            fs.registry.to_dict(project=project))

        expect_exit_code(["teardown"], 0)
Ejemplo n.º 22
0
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    Create a feature repo from the test repo config and yield its environment.

    A dataset is generated and loaded through the configured data source
    creator, a feature store is built in a temporary directory, and the
    resulting environment is yielded so tests can interact with it.
    The user is *not* expected to perform any clean up actions: the data
    source creator and the feature store are torn down once the consumer is
    done with the yielded environment.

    :param test_repo_config: configuration
    :param create_and_apply: when true, apply the driver and customer entities
        together with their feature views before yielding
    :param materialize: when true, materialize between the environment's start
        and end dates before yielding
    :return: An environment wrapping the feature store built using the
        supplied configuration.
    """
    df = create_dataset()

    unique_suffix = str(uuid.uuid4()).replace("-", "")[:8]
    project = f"test_correctness_{unique_suffix}"

    # The creator is configured as a dotted "module.ClassName" path.
    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1)
    creator_class = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator")
    offline_creator: DataSourceCreator = creator_class(project)

    column_renames = {"ts_1": "ts", "id": "driver_id"}
    ds = offline_creator.create_data_source(project,
                                            df,
                                            field_mapping=column_renames)
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        registry_file = Path(repo_dir_name) / "registry.db"
        repo_config = RepoConfig(
            registry=str(registry_file),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=repo_config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        try:
            if create_and_apply:
                entities = [driver(), customer()]
                feature_views = [
                    environment.driver_stats_feature_view(),
                    environment.customer_feature_view(),
                ]
                # Feature views are listed ahead of the entities they use.
                fs.apply(feature_views + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            offline_creator.teardown()
            fs.teardown()
Ejemplo n.º 23
0
# Define an entity for the driver. You can think of an entity as a primary key
# used to fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column,
# timestamps and three feature columns. Here we define a Feature View that will
# allow us to serve this data to our model online. Feature values are kept for
# 86400 * 365 seconds (365 days).
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 365),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

# Load the feature store with an empty repo path (presumably resolved relative
# to the current working directory -- confirm) and register the definitions.
fs = FeatureStore("")
fs.apply([driver_hourly_stats_view, driver])

# Load feature values into the online store up to the current local time.
now = datetime.now()
fs.materialize_incremental(now)
Ejemplo n.º 24
0
def test_universal_cli(environment: Environment):
    """Drive the CLI end to end against a throwaway feature repository.

    The test writes a ``feature_store.yaml`` and an example repo into a
    temporary directory, applies it, and then checks that:

    * the ``list`` sub-commands exit with 0,
    * ``describe`` exits with 0 for known objects and 1 for unknown ones,
    * a second ``apply`` leaves the registry specs unchanged,
    * ``teardown`` exits with 0.

    A final best-effort ``teardown`` always runs, even when a check fails.
    """
    project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}"
    runner = CliRunner()
    object_kinds = (
        "entities",
        "feature-views",
        "feature-services",
        "data-sources",
    )

    def specs_only(registry_dict):
        # Reduce every registry object to its "spec" (when it has one) so that
        # metadata does not take part in the comparison.
        stripped = {}
        for key, value in registry_dict.items():
            stripped[key] = [
                fco["spec"] if "spec" in fco else fco for fco in value
            ]
        return stripped

    with tempfile.TemporaryDirectory() as repo_dir_name:
        try:
            repo_path = Path(repo_dir_name)

            def expect_exit(args, code):
                # Run one CLI command inside the repo and check its exit code.
                outcome = runner.run(args, cwd=repo_path)
                assertpy.assert_that(outcome.returncode).is_equal_to(code)

            # Lay out the repository: config file plus example definitions.
            yaml_text = make_feature_store_yaml(
                project, environment.test_repo_config, repo_path)
            (repo_path / "feature_store.yaml").write_text(dedent(yaml_text))
            (repo_path / "example.py").write_text(
                get_example_repo("example_feature_repo_1.py"))
            expect_exit(["apply"], 0)

            # Snapshot the registry specs so they can be compared later.
            fs = FeatureStore(repo_path=str(repo_path))
            registry_specs = specs_only(fs.registry.to_dict(project=project))

            # Every list command should succeed.
            for kind in object_kinds:
                expect_exit([kind, "list"], 0)

            # Describe commands should succeed when the object exists.
            expect_exit(["entities", "describe", "driver"], 0)
            expect_exit(["feature-views", "describe", "driver_locations"], 0)
            expect_exit(
                ["feature-services", "describe", "driver_locations_service"],
                0,
            )
            assertpy.assert_that(fs.list_feature_views()).is_length(4)
            expect_exit(
                ["data-sources", "describe", "customer_profile_source"], 0)
            assertpy.assert_that(fs.list_data_sources()).is_length(4)

            # Describe commands should fail when the object does not exist.
            for kind in object_kinds:
                expect_exit([kind, "describe", "foo"], 1)

            # A repeated apply should be a no-op and must not raise errors.
            expect_exit(["apply"], 0)
            basic_rw_test(
                FeatureStore(repo_path=str(repo_path), config=None),
                view_name="driver_locations",
            )

            # The registry specs must be the same as before the second apply.
            specs_after = specs_only(fs.registry.to_dict(project=project))
            assertpy.assert_that(registry_specs).is_equal_to(specs_after)

            expect_exit(["teardown"], 0)
        finally:
            # Best-effort cleanup; the exit code is deliberately not checked.
            runner.run(["teardown"], cwd=repo_path)
Ejemplo n.º 25
0
    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        """Set up the source from its config and pipeline context.

        Args:
            config: Source configuration; its ``path`` attribute is passed to
                ``FeatureStore``.
            ctx: Pipeline context, forwarded to the parent initializer.
        """
        super().__init__(ctx)

        # Fresh report object for this source instance.
        self.report = SourceReport()
        self.source_config = config
        # Feature store built from the path given in the configuration.
        self.feature_store = FeatureStore(config.path)
Ejemplo n.º 26
0
from feast import FeatureStore, ValueType
import pandas as pd
from datetime import datetime

# Entity rows to enrich: each driver id is paired with one event timestamp.
driver_ids = [1001, 1002, 1003, 1004]
event_timestamps = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
    datetime(2021, 4, 12, 15, 1, 12),
]
entity_df = pd.DataFrame.from_dict(
    {"driver_id": driver_ids, "event_timestamp": event_timestamps}
)

store = FeatureStore(repo_path="feast_repo")

# Features to fetch, written as "<feature view>:<feature name>".
feature_refs = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
]

# Request historical feature values for the entity rows and convert the
# result into a DataFrame.
historical_job = store.get_historical_features(
    entity_df=entity_df,
    feature_refs=feature_refs,
)
training_df = historical_job.to_df()

print(training_df.head())

# another feature store

store = FeatureStore(repo_path="feature_transaction")