Example 1: verify that a telemetry record is written when FEAST_TELEMETRY is enabled
def test_telemetry_on():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    with tempfile.TemporaryDirectory() as temp_dir:
        test_feature_store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="fake_project",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online.db")
                ),
            )
        )
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        test_feature_store.apply([entity])

        os.environ.clear()
        os.environ.update(old_environ)
        ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Example 2: helper that asserts expected feature values in both the online and offline stores
def check_offline_and_online_features(
    fs: FeatureStore,
    fv: FeatureView,
    driver_id: int,
    event_timestamp: datetime,
    expected_value: Optional[float],
    full_feature_names: bool,
    check_offline_store: bool = True,
) -> None:
    # Check online store
    response_dict = fs.get_online_features(
        [f"{fv.name}:value"],
        [{
            "driver_id": driver_id
        }],
        full_feature_names=full_feature_names,
    ).to_dict()

    key = f"{fv.name}__value" if full_feature_names else "value"
    if expected_value:
        assert abs(response_dict[key][0] - expected_value) < 1e-6, (
            f"Response: {response_dict}, Expected: {expected_value}"
        )
    else:
        assert response_dict[key][0] is None

    # Check offline store
    if check_offline_store:
        df = fs.get_historical_features(
            entity_df=pd.DataFrame.from_dict({
                "driver_id": [driver_id],
                "event_timestamp": [event_timestamp]
            }),
            features=[f"{fv.name}:value"],
            full_feature_names=full_feature_names,
        ).to_df()

        df_dict = df.to_dict(orient="list")
        key = f"{fv.name}__value" if full_feature_names else "value"
        if expected_value:
            assert abs(df_dict[key][0] - expected_value) < 1e-6
        else:
            assert not df_dict[key] or math.isnan(df_dict[key][0])
Example 3: print the latest event timestamp of each feature view's batch source
def get_latest_timestamps():
    store = FeatureStore(repo_path=".")
    feature_views = store.list_feature_views()
    for fv in feature_views:
        print(
            f"Data source latest event for {fv.name} is {fv.batch_source._meta.latest_event_timestamp}"
        )
Example 4: construct a disposable test Environment around a temporary feature repository
def construct_test_environment(
    test_repo_config: IntegrationTestRepoConfig,
    test_suite_name: str = "integration_test",
) -> Environment:
    project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}"

    offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project)

    offline_store_config = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store_config,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        # We need to initialize the registry, because if nothing is applied in the test before tearing down
        # the feature store, that will cause the teardown method to blow up.
        fs.registry._initialize_registry()
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source_creator=offline_creator,
        )

        try:
            yield environment
        finally:
            fs.teardown()
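
A plausible usage sketch, not part of the original source: because construct_test_environment yields, it is presumably wrapped as a context manager or pytest fixture so that fs.teardown() always runs on exit. The repo config name below is hypothetical.

from contextlib import contextmanager

test_environment = contextmanager(construct_test_environment)

# local_repo_config is a hypothetical IntegrationTestRepoConfig instance.
with test_environment(local_repo_config) as environment:
    environment.feature_store.list_feature_views()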
Example 5: register an entity and a feature view backed by a file source (horse-colic demo)
def store_offline(feature_store: FeatureStore,
                  dataframe: FlyteSchema) -> FeatureStore:
    horse_colic_entity = Entity(name="Hospital Number",
                                value_type=ValueType.STRING)

    horse_colic_feature_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=[
            Feature(name="rectal temperature", dtype=ValueType.FLOAT),
            Feature(name="total protein", dtype=ValueType.FLOAT),
            Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
            Feature(name="surgical lesion", dtype=ValueType.STRING),
            Feature(name="abdominal distension", dtype=ValueType.FLOAT),
            Feature(name="nasogastric tube", dtype=ValueType.STRING),
            Feature(name="outcome", dtype=ValueType.STRING),
            Feature(name="packed cell volume", dtype=ValueType.FLOAT),
            Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
        ],
        batch_source=FileSource(
            path=str(dataframe.remote_path),
            event_timestamp_column="timestamp",
        ),
        ttl=timedelta(days=1),
    )

    # Ingest the data into feast
    feature_store.apply([horse_colic_entity, horse_colic_feature_view])

    return feature_store
Example 6: verify that no usage record is written when FEAST_USAGE is disabled
def test_usage_off():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "False"
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id

    with tempfile.TemporaryDirectory() as temp_dir:
        test_feature_store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="fake_project",
                provider="local",
                online_store=SqliteOnlineStoreConfig(
                    path=os.path.join(temp_dir, "online.db")
                ),
            )
        )
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )
        test_feature_store.apply([entity])

        os.environ.clear()
        os.environ.update(old_environ)
        sleep(30)
        rows = read_bigquery_usage_id(test_usage_id)
        assert rows.total_rows == 0
Example 7: consistency test covering materialize() and materialize_incremental()
def run_offline_online_store_consistency_test(fs: FeatureStore,
                                              fv: FeatureView) -> None:
    now = datetime.now()

    full_feature_names = True
    check_offline_store: bool = True

    # Run materialize()
    # use both tz-naive & tz-aware timestamps to test that they're both correctly handled
    start_date = (now - timedelta(hours=5)).replace(tzinfo=utc)
    end_date = now - timedelta(hours=2)
    fs.materialize(feature_views=[fv.name],
                   start_date=start_date,
                   end_date=end_date)

    # check result of materialize()
    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=1,
        event_timestamp=end_date,
        expected_value=0.3,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )

    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=2,
        event_timestamp=end_date,
        expected_value=None,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )

    # check prior value for materialize_incremental()
    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=3,
        event_timestamp=end_date,
        expected_value=4,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )

    # run materialize_incremental()
    fs.materialize_incremental(feature_views=[fv.name], end_date=now)

    # check result of materialize_incremental()
    check_offline_and_online_features(
        fs=fs,
        fv=fv,
        driver_id=3,
        event_timestamp=now,
        expected_value=5,
        full_feature_names=full_feature_names,
        check_offline_store=check_offline_store,
    )
Example 8: initialize the Amundsen FeastExtractor from its configuration
    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_repository_path = conf.get_string(
            FeastExtractor.FEAST_REPOSITORY_PATH)
        self._describe_feature_views = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_VIEWS)
        self._feast = FeatureStore(repo_path=self._feast_repository_path)
        self._extract_iter: Union[None, Iterator] = None
Example 9: assert that online features are up to date with the source dataframe
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    result = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{
            "driver_id": 1001
        }],
        full_feature_names=True,
    ).to_dict()

    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (abs(result["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
Example 10: model wrapper that reads online features at prediction time
class DriverRankingModel:
    def __init__(self):
        # Load model
        self.model = load("driver_model.bin")

        # Set up feature store
        self.fs = FeatureStore(repo_path="driver_ranking/")

    def predict(self, driver_ids):
        # Read features from Feast
        driver_features = self.fs.get_online_features(
            entity_rows=[{
                "driver_id": driver_id
            } for driver_id in driver_ids],
            feature_refs=[
                "driver_hourly_stats:conv_rate",
                "driver_hourly_stats:acc_rate",
                "driver_hourly_stats:avg_daily_trips",
            ],
        )
        features_df = pd.DataFrame.from_dict(driver_features.to_dict())

        # Make prediction
        features_df["prediction"] = self.model.predict(features_df)

        # Choose best driver
        best_driver_id = features_df["driver_id"].iloc[
            features_df["prediction"].argmax()]

        # return best driver
        return best_driver_id
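
A minimal usage sketch, assuming a trained model saved as driver_model.bin and a populated online store; the driver ids below are made up.

model = DriverRankingModel()
best_driver = model.predict([1001, 1002, 1003])  # hypothetical driver ids
print(f"Best driver: {best_driver}")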
Example 11: unit test for FeatureStore._get_unique_entities deduplication
def test__get_unique_entities():
    entity_values = {
        "entity_1": [Value(int64_val=1), Value(int64_val=2), Value(int64_val=1)],
        "entity_2": [
            Value(string_val="1"),
            Value(string_val="2"),
            Value(string_val="1"),
        ],
        "entity_3": [Value(int64_val=8), Value(int64_val=9), Value(int64_val=10)],
    }

    entity_name_to_join_key_map = {"entity_1": "entity_1", "entity_2": "entity_2"}

    fv = MockFeatureView(
        name="fv_1",
        entities=["entity_1", "entity_2"],
        projection=MockFeatureViewProjection(join_key_map={}),
    )

    unique_entities, indexes = FeatureStore._get_unique_entities(
        FeatureStore,
        table=fv,
        join_key_values=entity_values,
        entity_name_to_join_key_map=entity_name_to_join_key_map,
    )

    assert unique_entities == (
        {"entity_1": Value(int64_val=1), "entity_2": Value(string_val="1")},
        {"entity_1": Value(int64_val=2), "entity_2": Value(string_val="2")},
    )
    assert indexes == ([0, 2], [1])
Example 12: variant of Example 9 that also checks protobuf value types
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    response = store.get_online_features(
        features=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
            "global_daily_stats:num_rides",
            "global_daily_stats:avg_ride_length",
        ],
        entity_rows=[{
            "driver_id": 1001
        }],
        full_feature_names=True,
    )

    # Float features should still be floats from the online store...
    assert (response.proto.results[list(
        response.proto.metadata.feature_names.val).index(
            "driver_hourly_stats__conv_rate")].values[0].float_val > 0)

    result = response.to_dict()
    assert len(result) == 5
    assert "driver_hourly_stats__avg_daily_trips" in result
    assert "driver_hourly_stats__conv_rate" in result
    assert (abs(result["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
    assert "global_daily_stats__num_rides" in result
    assert "global_daily_stats__avg_ride_length" in result
Example 13: demo that reads a saved dataset and online features through a feature service
def run_demo():
    store = FeatureStore(repo_path=".")

    print("--- Historical features (from saved dataset) ---")
    ds = store.get_saved_dataset("my_training_ds")
    print(ds.to_df())

    print("\n--- Online features ---")
    features = store.get_online_features(
        features=store.get_feature_service("credit_score_v3"),
        entity_rows=[
            {"zipcode": 30721, "dob_ssn": "19530219_5179", "transaction_amt": 1023}
        ],
    ).to_dict()
    for key, value in sorted(features.items()):
        print(key, " : ", value)
Example 14: retrieve online features for a randomly chosen entity row
def retrieve_online(feature_store: FeatureStore,
                    dataset: pd.DataFrame) -> dict:
    inference_data = random.choice(dataset["Hospital Number"])
    logger.info(f"Hospital Number chosen for inference is: {inference_data}")
    entity_rows = [{"Hospital Number": inference_data}]

    return feature_store.get_online_features(FEAST_FEATURES, entity_rows)
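
FEAST_FEATURES is referenced here but defined elsewhere; a plausible sketch, inferred from the horse_colic_stats feature view in Example 5, is a list of feature references:

# Hypothetical feature reference list matching the horse_colic_stats view above.
FEAST_FEATURES = [
    "horse_colic_stats:rectal temperature",
    "horse_colic_stats:total protein",
    "horse_colic_stats:surgical lesion",
]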
Example 15: retrieve historical features for a fixed entity dataframe
def load_historical_features(feature_store: FeatureStore) -> FlyteSchema:
    entity_df = pd.DataFrame.from_dict({
        "Hospital Number": [
            "530101",
            "5290409",
            "5291329",
            "530051",
            "529518",
            "530101",
            "529340",
            "5290409",
            "530034",
        ],
        "event_timestamp": [
            datetime(2021, 6, 25, 16, 36, 27),
            datetime(2021, 6, 25, 16, 36, 27),
            datetime(2021, 6, 25, 16, 36, 27),
            datetime(2021, 6, 25, 16, 36, 27),
            datetime(2021, 6, 25, 16, 36, 27),
            datetime(2021, 7, 5, 11, 36, 1),
            datetime(2021, 6, 25, 16, 36, 27),
            datetime(2021, 7, 5, 11, 50, 40),
            datetime(2021, 6, 25, 16, 36, 27),
        ],
    })

    return feature_store.get_historical_features(
        entity_df=entity_df, features=FEAST_FEATURES)  # noqa
Example 16: end-to-end Snowflake demo covering apply, historical retrieval, materialization, and online retrieval
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to Snowflake
    print("Deploying feature store to Snowflake...")
    fs.apply([driver, driver_stats_fv])

    # Select features
    features = [
        "driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"
    ]

    # Create an entity dataframe. This is the dataframe that will be enriched with historical features
    entity_df = pd.DataFrame({
        "event_timestamp": [
            pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
            for dt in pd.date_range(
                start=datetime.now() - timedelta(days=3),
                end=datetime.now(),
                periods=3,
            )
        ],
        "driver_id": [1001, 1002, 1003],
    })

    print("Retrieving training data...")

    # Retrieve historical features by joining the entity dataframe to the Snowflake table source
    training_df = fs.get_historical_features(features=features,
                                             entity_df=entity_df).to_df()

    print()
    print(training_df)

    print()
    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print()
    print("Retrieving online features...")

    # Retrieve features from the online store
    online_features = fs.get_online_features(
        features=features,
        entity_rows=[{
            "driver_id": 1001
        }, {
            "driver_id": 1002
        }],
    ).to_dict()

    print()
    print(pd.DataFrame.from_dict(online_features))
Example 17: retrieve historical features for a training task
def get_historical_features():
    """Retrieve historical features for training."""
    # Entities to pull data for (should dynamically read this from somewhere)
    project_ids = [1, 2, 3]
    now = datetime.now()
    timestamps = [datetime(now.year, now.month, now.day)] * len(project_ids)
    entity_df = pd.DataFrame.from_dict({"id": project_ids, "event_timestamp": timestamps})

    # Get historical features
    store = FeatureStore(repo_path=Path(config.BASE_DIR, "features"))
    training_df = store.get_historical_features(
        entity_df=entity_df,
        feature_refs=["project_details:text", "project_details:tags"],
    ).to_df()

    # Store in location for training task to pick up
    print(training_df.head())
Example 18: end-to-end AWS demo serving features from DynamoDB
def main():
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Load the feature store from the current path
    fs = FeatureStore(repo_path=".")

    # Deploy the feature store to AWS
    print("Deploying feature store to AWS...")
    fs.apply([driver, driver_hourly_stats_view])

    # Select features
    feature_refs = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"]

    print("Loading features into the online store...")
    fs.materialize_incremental(end_date=datetime.now())

    print("Retrieving online features...")

    # Retrieve features from the online store (DynamoDB)
    online_features = fs.get_online_features(
        feature_refs=feature_refs,
        entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
    ).to_dict()

    print(pd.DataFrame.from_dict(online_features))
Example 19: build a FeatureStore from a project-specific FeatureStoreConfig
def build_feature_store(s3_bucket: str, registry_path: str,
                        online_store_path: str) -> FeatureStore:
    feature_store_config = FeatureStoreConfig(
        project="horsecolic",
        s3_bucket=s3_bucket,
        registry_path=registry_path,
        online_store_path=online_store_path,
    )
    return FeatureStore(config=feature_store_config)
Example 20: telemetry-on test that loads the repository from the working directory
def test_telemetry_on():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "True"

    test_feature_store = FeatureStore()
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    test_feature_store.apply([entity])

    os.environ.clear()
    os.environ.update(old_environ)
    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Example 21: verify that no telemetry record is written when FEAST_TELEMETRY is disabled
def test_telemetry_off():
    old_environ = dict(os.environ)
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    os.environ["FEAST_TELEMETRY"] = "False"
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id

    test_feature_store = FeatureStore()
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )
    test_feature_store.apply([entity])

    os.environ.clear()
    os.environ.update(old_environ)
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
Example 22: prepare a local repository for a FeatureStore docstring test
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    fs = FeatureStore(repo_path="feature_repo")
    driver = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    driver_hourly_stats = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    driver_hourly_stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=driver_hourly_stats,
    )
    fs.apply([driver_hourly_stats_view, driver])
    fs.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
Example 23: verify that usage is still reported when FeatureStore construction fails
def test_exception_usage_on():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "True"

    try:
        test_feature_store = FeatureStore("/tmp/non_existent_directory")
    except Exception:
        pass

    os.environ.clear()
    os.environ.update(old_environ)
    ensure_bigquery_usage_id_with_retry(test_usage_id)
Example 24: verify that no usage is reported on failure when FEAST_USAGE is disabled
def test_exception_usage_off():
    old_environ = dict(os.environ)
    test_usage_id = str(uuid.uuid4())
    os.environ["FEAST_IS_USAGE_TEST"] = "True"
    os.environ["FEAST_USAGE"] = "False"
    os.environ["FEAST_FORCE_USAGE_UUID"] = test_usage_id

    try:
        test_feature_store = FeatureStore("/tmp/non_existent_directory")
    except Exception:
        pass

    os.environ.clear()
    os.environ.update(old_environ)
    sleep(30)
    rows = read_bigquery_usage_id(test_usage_id)
    assert rows.total_rows == 0
Example 25: create a saved dataset from a historical retrieval job
def generate_saved_dataset():
    store = FeatureStore(repo_path=".")
    entity_df = pd.read_parquet(path="data/loan_table.parquet")

    fs = store.get_feature_service("credit_score_v1")
    job = store.get_historical_features(entity_df=entity_df, features=fs,)
    store.create_saved_dataset(
        from_=job,
        name="my_training_ds",
        storage=SavedDatasetFileStorage(path="my_training_ds.parquet"),
        feature_service=fs,
        profiler=credit_profiler,
    )
Example 26: assert online feature freshness using the older feature_refs argument
def _assert_online_features(store: FeatureStore, driver_df: pd.DataFrame,
                            max_date: datetime):
    """Assert that features in online store are up to date with `max_date` date."""
    # Read features back
    result = store.get_online_features(
        feature_refs=[
            "driver_hourly_stats:conv_rate",
            "driver_hourly_stats:avg_daily_trips",
        ],
        entity_rows=[{
            "driver_id": 1001
        }],
    )

    assert "driver_hourly_stats__avg_daily_trips" in result.to_dict()

    assert "driver_hourly_stats__conv_rate" in result.to_dict()
    assert (abs(result.to_dict()["driver_hourly_stats__conv_rate"][0] -
                _get_last_feature_row(driver_df, 1001, max_date)["conv_rate"])
            < 0.01)
Example 27: materialize a time range into the online store
def store_online(feature_store: FeatureStore) -> FeatureStore:
    feature_store.materialize(
        start_date=datetime.utcnow() - timedelta(days=250),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
    return feature_store
Example 28: DataHub ingestion source that extracts Feast metadata
class FeastRepositorySource(Source):
    """
    This plugin extracts:

    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
    - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
    - Column types associated with each entity and feature
    """

    source_config: FeastRepositorySourceConfig
    report: SourceReport
    feature_store: FeatureStore

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        super().__init__(ctx)

        self.source_config = config
        self.report = SourceReport()
        self.feature_store = FeatureStore(self.source_config.path)

    def _get_field_type(self, field_type: ValueType, parent_name: str) -> str:
        """
        Maps types encountered in Feast to corresponding schema types.
        """

        ml_feature_data_type = _field_type_mapping.get(field_type)

        if ml_feature_data_type is None:
            self.report.report_warning(
                parent_name, f"unable to map type {field_type} to metadata schema"
            )

            ml_feature_data_type = MLFeatureDataType.UNKNOWN

        return ml_feature_data_type

    def _get_data_source_details(self, source: DataSource) -> Tuple[str, str]:
        """
        Get Feast batch/stream source platform and name.
        """

        platform = "unknown"
        name = "unknown"

        if isinstance(source, FileSource):
            platform = "file"

            name = source.path.replace("://", ".").replace("/", ".")

        if isinstance(source, BigQuerySource):
            platform = "bigquery"
            name = source.table

        if isinstance(source, KafkaSource):
            platform = "kafka"
            name = source.kafka_options.topic

        if isinstance(source, KinesisSource):
            platform = "kinesis"
            name = (
                f"{source.kinesis_options.region}:{source.kinesis_options.stream_name}"
            )

        if isinstance(source, RequestDataSource):
            platform = "request"
            name = source.name

        return platform, name

    def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
        """
        Get data source URN list.
        """

        sources = []

        if feature_view.batch_source is not None:
            batch_source_platform, batch_source_name = self._get_data_source_details(
                feature_view.batch_source
            )
            sources.append(
                builder.make_dataset_urn(
                    batch_source_platform,
                    batch_source_name,
                    self.source_config.environment,
                )
            )

        if feature_view.stream_source is not None:
            stream_source_platform, stream_source_name = self._get_data_source_details(
                feature_view.stream_source
            )
            sources.append(
                builder.make_dataset_urn(
                    stream_source_platform,
                    stream_source_name,
                    self.source_config.environment,
                )
            )

        return sources

    def _get_entity_workunit(
        self, feature_view: FeatureView, entity: Entity
    ) -> MetadataWorkUnit:
        """
        Generate an MLPrimaryKey work unit for a Feast entity.
        """

        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
            aspects=[StatusClass(removed=False)],
        )

        entity_snapshot.aspects.append(
            MLPrimaryKeyPropertiesClass(
                description=entity.description,
                dataType=self._get_field_type(entity.value_type, entity.name),
                sources=self._get_data_sources(feature_view),
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)

        return MetadataWorkUnit(id=entity.name, mce=mce)

    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeature work unit for a Feast feature.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
            aspects=[StatusClass(removed=False)],
        )

        feature_sources = []

        if isinstance(feature_view, FeatureView):
            feature_sources = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            if feature_view.input_request_data_sources is not None:
                for request_source in feature_view.input_request_data_sources.values():
                    source_platform, source_name = self._get_data_source_details(
                        request_source
                    )

                    feature_sources.append(
                        builder.make_dataset_urn(
                            source_platform,
                            source_name,
                            self.source_config.environment,
                        )
                    )

            if feature_view.input_feature_view_projections is not None:
                for (
                    feature_view_projection
                ) in feature_view.input_feature_view_projections.values():
                    feature_view_source = self.feature_store.get_feature_view(
                        feature_view_projection.name
                    )

                    feature_sources.extend(self._get_data_sources(feature_view_source))

        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                description=feature.labels.get("description"),
                dataType=self._get_field_type(feature.dtype, feature.name),
                sources=feature_sources,
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

        return MetadataWorkUnit(id=feature.name, mce=mce)

    def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast feature view.
        """

        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[f"/feast/{self.feature_store.project}/{feature_view_name}"]
                ),
                StatusClass(removed=False),
            ],
        )

        feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        feature_view_name,
                        feature.name,
                    )
                    for feature in feature_view.features
                ],
                mlPrimaryKeys=[
                    builder.make_ml_primary_key_urn(feature_view_name, entity_name)
                    for entity_name in feature_view.entities
                ],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_view_snapshot)

        return MetadataWorkUnit(id=feature_view_name, mce=mce)

    def _get_on_demand_feature_view_workunit(
        self, on_demand_feature_view: OnDemandFeatureView
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast on-demand feature view.
        """

        on_demand_feature_view_name = (
            f"{self.feature_store.project}.{on_demand_feature_view.name}"
        )

        on_demand_feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", on_demand_feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[
                        f"/feast/{self.feature_store.project}/{on_demand_feature_view_name}"
                    ]
                ),
                StatusClass(removed=False),
            ],
        )

        on_demand_feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        on_demand_feature_view_name,
                        feature.name,
                    )
                    for feature in on_demand_feature_view.features
                ],
                mlPrimaryKeys=[],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=on_demand_feature_view_snapshot)

        return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)

    @classmethod
    def create(cls, config_dict, ctx):
        config = FeastRepositorySourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        for feature_view in self.feature_store.list_feature_views():
            for entity_name in feature_view.entities:
                entity = self.feature_store.get_entity(entity_name)

                work_unit = self._get_entity_workunit(feature_view, entity)
                self.report.report_workunit(work_unit)

                yield work_unit

            for feature in feature_view.features:
                work_unit = self._get_feature_workunit(feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_feature_view_workunit(feature_view)
            self.report.report_workunit(work_unit)

            yield work_unit

        for on_demand_feature_view in self.feature_store.list_on_demand_feature_views():
            for feature in on_demand_feature_view.features:
                work_unit = self._get_feature_workunit(on_demand_feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_on_demand_feature_view_workunit(
                on_demand_feature_view
            )
            self.report.report_workunit(work_unit)

            yield work_unit

    def get_report(self) -> SourceReport:
        return self.report

    def close(self) -> None:
        return
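
A hedged standalone-usage sketch built only from calls shown above (create() and get_workunits()); the PipelineContext arguments are assumptions.

# Hypothetical driver for the source outside a full DataHub pipeline.
ctx = PipelineContext(run_id="feast-demo")
source = FeastRepositorySource.create({"path": "./feature_repo"}, ctx)
for work_unit in source.get_workunits():
    print(work_unit.id)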
Example 29: define an entity and a feature view, apply them, and materialize incrementally
# Define an entity for the driver. You can think of an entity as a primary key used to
# fetch features.
driver = Entity(
    name="driver_id",
    value_type=ValueType.INT64,
    description="driver id",
)

# Our parquet files contain sample data that includes a driver_id column, timestamps, and
# three feature columns. Here we define a feature view that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400 * 365),
    features=[
        Feature(name="conv_rate", dtype=ValueType.DOUBLE),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    online=True,
    batch_source=driver_hourly_stats,
    tags={},
)

fs = FeatureStore("")
fs.apply([driver_hourly_stats_view, driver])

now = datetime.now()
fs.materialize_incremental(now)
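
After the incremental materialization, the features could be read back from the online store. A minimal sketch, assuming driver id 1001 exists in the source parquet file; depending on the Feast version, the first argument is named features or feature_refs.

online_features = fs.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    entity_rows=[{"driver_id": 1001}],  # hypothetical driver id
).to_dict()
print(online_features)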