Example #1
def test_apply_conflicting_featureview_names(
        feature_store_with_local_registry):
    """ Test applying feature views with non-case-insensitively unique names"""

    driver_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=10),
        online=False,
        batch_source=FileSource(path="driver_stats.parquet"),
        tags={},
    )

    customer_stats = FeatureView(
        name="DRIVER_HOURLY_STATS",
        entities=["id"],
        ttl=timedelta(seconds=10),
        online=False,
        batch_source=FileSource(path="customer_stats.parquet"),
        tags={},
    )
    try:
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
        error = None
    except ValueError as e:
        error = e
    assert (
        isinstance(error, ValueError) and
        "Please ensure that all feature view names are case-insensitively unique"
        in error.args[0])

    feature_store_with_local_registry.teardown()
Example #2
def test_apply_duplicated_featureview_names(feature_store_with_local_registry):
    """ Test applying feature views with duplicated names"""

    driver_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=10),
        online=False,
        input=FileSource(path="driver_stats.parquet"),
        tags={},
    )

    customer_stats = FeatureView(
        name="driver_hourly_stats",
        entities=["id"],
        ttl=timedelta(seconds=10),
        online=False,
        input=FileSource(path="customer_stats.parquet"),
        tags={},
    )
    try:
        feature_store_with_local_registry.apply([driver_stats, customer_stats])
        error = None
    except ValueError as e:
        error = e
    assert (
        isinstance(error, ValueError)
        and "Please ensure that all feature view names are unique" in error.args[0]
    )

    feature_store_with_local_registry.teardown()
Example #3
def test_infer_datasource_names_file():
    file_path = "path/to/test.csv"
    data_source = FileSource(path=file_path)
    assert data_source.name == file_path

    source_name = "my_name"
    data_source = FileSource(name=source_name, path=file_path)
    assert data_source.name == source_name
Example #4
def test_apply_data_source(test_registry: Registry):
    # Create Feature Views
    batch_source = FileSource(
        name="test_source",
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register data source and feature view
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)

    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_data_source = registry_data_sources[0]
    assert registry_data_source == batch_source

    # Check that change to batch source propagates
    batch_source.timestamp_field = "new_ts_col"
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)
    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_batch_source = test_registry.list_data_sources(project)[0]
    assert registry_batch_source == batch_source

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example #5
def store_offline(feature_store: FeatureStore,
                  dataframe: FlyteSchema) -> FeatureStore:
    horse_colic_entity = Entity(name="Hospital Number",
                                value_type=ValueType.STRING)

    horse_colic_feature_view = FeatureView(
        name="horse_colic_stats",
        entities=["Hospital Number"],
        features=[
            Feature(name="rectal temperature", dtype=ValueType.FLOAT),
            Feature(name="total protein", dtype=ValueType.FLOAT),
            Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
            Feature(name="surgical lesion", dtype=ValueType.STRING),
            Feature(name="abdominal distension", dtype=ValueType.FLOAT),
            Feature(name="nasogastric tube", dtype=ValueType.STRING),
            Feature(name="outcome", dtype=ValueType.STRING),
            Feature(name="packed cell volume", dtype=ValueType.FLOAT),
            Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
        ],
        batch_source=FileSource(
            path=str(dataframe.remote_path),
            event_timestamp_column="timestamp",
        ),
        ttl=timedelta(days=1),
    )

    # Ingest the data into feast
    feature_store.apply([horse_colic_entity, horse_colic_feature_view])

    return feature_store
Example #6
def setup_feature_store():
    """Prepares the local environment for a FeatureStore docstring test."""
    from datetime import datetime, timedelta

    from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
    from feast.repo_operations import init_repo

    init_repo("feature_repo", "local")
    fs = FeatureStore(repo_path="feature_repo")
    driver = Entity(
        name="driver_id",
        value_type=ValueType.INT64,
        description="driver id",
    )
    driver_hourly_stats = FileSource(
        path="feature_repo/data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    driver_hourly_stats_view = FeatureView(
        name="driver_hourly_stats",
        entities=["driver_id"],
        ttl=timedelta(seconds=86400 * 1),
        features=[
            Feature(name="conv_rate", dtype=ValueType.FLOAT),
            Feature(name="acc_rate", dtype=ValueType.FLOAT),
            Feature(name="avg_daily_trips", dtype=ValueType.INT64),
        ],
        batch_source=driver_hourly_stats,
    )
    fs.apply([driver_hourly_stats_view, driver])
    fs.materialize(
        start_date=datetime.utcnow() - timedelta(hours=3),
        end_date=datetime.utcnow() - timedelta(minutes=10),
    )
Example #7
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str,
                         config: Config) -> FileSource:
    """
    Dumps the given entities dataframe as a parquet file and stages it to remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source[
            "event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(f,
                                          df_export_path.name,
                                          remote_uri=entity_staging_uri)

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
Example #8
    def create_data_source(
        self,
        df: pd.DataFrame,
        destination_name: str,
        timestamp_field="ts",
        created_timestamp_column="created_ts",
        field_mapping: Dict[str, str] = None,
    ) -> DataSource:

        destination_name = self.get_prefixed_table_name(destination_name)

        f = tempfile.NamedTemporaryFile(
            prefix=f"{self.project_name}_{destination_name}",
            suffix=".parquet",
            delete=False,
        )
        df.to_parquet(f.name)
        self.files.append(f)
        return FileSource(
            file_format=ParquetFormat(),
            path=f"{f.name}",
            timestamp_field=timestamp_field,
            created_timestamp_column=created_timestamp_column,
            field_mapping=field_mapping or {"ts_1": "ts"},
        )
Example #9
    def create_data_source(
        self,
        df: pd.DataFrame,
        destination_name: Optional[str] = None,
        suffix: Optional[str] = None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        field_mapping: Dict[str, str] = None,
    ) -> DataSource:
        filename = f"{destination_name}.parquet"
        port = self.minio.get_exposed_port("9000")
        host = self.minio.get_container_host_ip()
        minio_endpoint = f"{host}:{port}"

        self._upload_parquet_file(df, filename, minio_endpoint)

        return FileSource(
            file_format=ParquetFormat(),
            path=f"s3://{self.bucket}/{filename}",
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            date_partition_column="",
            field_mapping=field_mapping or {"ts_1": "ts"},
            s3_endpoint_override=f"http://{host}:{port}",
        )
Example #10
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == String
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == Array(String)
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == Array(Bytes)
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == String
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == Array(String)
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == Array(Bytes)
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
Example #11
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == ValueType.INT64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == ValueType.STRING
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == ValueType.STRING_LIST
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == ValueType.INT64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == ValueType.STRING
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == ValueType.STRING_LIST
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == ValueType.BYTES_LIST
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
Example #12
def stage_customer_daily_profile_parquet_source(directory, df):
    customer_profile_path = os.path.join(directory, "customer_profile.parquet")
    df.to_parquet(path=customer_profile_path, allow_truncated_timestamps=True)
    return FileSource(
        path=customer_profile_path,
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
Example #13
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(name="fs1_my_entity_1",
                value_type=ValueType.STRING,
                description="something")

    e2 = Entity(name="fs1_my_entity_2",
                value_type=ValueType.STRING,
                description="something")

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual

    test_feature_store.teardown()
Example #14
def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            50.0,
            True,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
            True,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=1),
            200.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=4),
            datetime(year=2020, month=9, day=1),
            300.0,
            False,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "transactions",
                                                  schema, df_data)
    file_source = FileSource("event_timestamp", "created_timestamp",
                             ParquetFormat(), file_uri)
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable("transactions", ["customer_id"],
                                 features,
                                 batch_source=file_source)
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
Example #15
def stage_driver_hourly_stats_parquet_source(directory, df):
    # Write to disk
    driver_stats_path = os.path.join(directory, "driver_stats.parquet")
    df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True)
    return FileSource(
        path=driver_stats_path,
        event_timestamp_column="event_timestamp",
        created_timestamp_column="",
    )
Example #16
def prep_file_source(df, event_timestamp_column=None) -> FileSource:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
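A brief usage sketch for the generator above (a hypothetical caller, not from the original source): since prep_file_source yields, it can back a pytest fixture or be consumed directly with next().

import pandas as pd

# Hypothetical driver data; any frame with the named timestamp column works.
df = pd.DataFrame(
    {
        "driver_id": [1001],
        "event_timestamp": [pd.Timestamp("2021-04-12", tz="UTC")],
    }
)
source_gen = prep_file_source(df, event_timestamp_column="event_timestamp")
file_source = next(source_gen)  # FileSource backed by the temporary parquet file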
Example #17
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    status = wait_retry_backoff(
        lambda:
        (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
Example #18
def test_schedule_batch_ingestion_jobs(pytestconfig, feast_client: Client,
                                       feast_spark_client: SparkClient):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *")
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(f"{feast_client.project}-{feature_table.name}".
                               encode()).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"
    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *")
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
Example #19
        def evaluate_offline_job():
            filesystem, path = FileSource.create_filesystem_and_path(
                data_source.path, data_source.file_options.s3_endpoint_override
            )
            source_df = pd.read_parquet(path, filesystem=filesystem)
            # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
            source_df[event_timestamp_column] = source_df[event_timestamp_column].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )
            if created_timestamp_column:
                source_df[created_timestamp_column] = source_df[
                    created_timestamp_column
                ].apply(
                    lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
                )

            source_columns = set(source_df.columns)
            if not set(join_key_columns).issubset(source_columns):
                raise FeastJoinKeysDuringMaterialization(
                    data_source.path, set(join_key_columns), source_columns
                )

            ts_columns = (
                [event_timestamp_column, created_timestamp_column]
                if created_timestamp_column
                else [event_timestamp_column]
            )

            source_df.sort_values(by=ts_columns, inplace=True)

            filtered_df = source_df[
                (source_df[event_timestamp_column] >= start_date)
                & (source_df[event_timestamp_column] < end_date)
            ]

            columns_to_extract = set(
                join_key_columns + feature_name_columns + ts_columns
            )
            if join_key_columns:
                last_values_df = filtered_df.drop_duplicates(
                    join_key_columns, keep="last", ignore_index=True
                )
            else:
                last_values_df = filtered_df
                last_values_df[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
                columns_to_extract.add(DUMMY_ENTITY_ID)

            return last_values_df[columns_to_extract]
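The heart of the job above is the filter-sort-dedupe over the join keys, so that only the latest row per entity inside [start_date, end_date) survives. Below is a minimal pandas-only sketch of that pattern (illustrative only, with made-up column names; not the Feast implementation):

import pandas as pd

# Illustrative source data: two rows for driver 1, one for driver 2.
source_df = pd.DataFrame(
    {
        "driver_id": [1, 1, 2],
        "event_timestamp": pd.to_datetime(
            ["2021-01-01", "2021-01-02", "2021-01-01"], utc=True
        ),
        "conv_rate": [0.1, 0.2, 0.3],
    }
)

start_date = pd.Timestamp("2021-01-01", tz="UTC")
end_date = pd.Timestamp("2021-01-03", tz="UTC")

# Keep rows inside the materialization window, sorted by event time.
filtered = source_df[
    (source_df["event_timestamp"] >= start_date)
    & (source_df["event_timestamp"] < end_date)
].sort_values("event_timestamp")

# Keep only the latest row per join key, mirroring drop_duplicates(keep="last") above.
latest = filtered.drop_duplicates(["driver_id"], keep="last", ignore_index=True)
print(latest)  # driver 1 -> conv_rate 0.2, driver 2 -> conv_rate 0.3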
Example #20
    def persist(self, storage: SavedDatasetStorage):
        assert isinstance(storage, SavedDatasetFileStorage)
        filesystem, path = FileSource.create_filesystem_and_path(
            storage.file_options.uri,
            storage.file_options.s3_endpoint_override,
        )

        if path.endswith(".parquet"):
            pyarrow.parquet.write_table(self.to_arrow(),
                                        where=path,
                                        filesystem=filesystem)
        else:
            # otherwise assume destination is directory
            pyarrow.parquet.write_to_dataset(self.to_arrow(),
                                             root_path=path,
                                             filesystem=filesystem)
Example #21
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
Example #22
def test_update_feature_views_with_inferred_features():
    file_source = FileSource(name="test", path="test path")
    entity1 = Entity(name="test1", join_keys=["test_column_1"])
    entity2 = Entity(name="test2", join_keys=["test_column_2"])
    feature_view_1 = FeatureView(
        name="test1",
        entities=[entity1],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
        ],
        source=file_source,
    )
    feature_view_2 = FeatureView(
        name="test2",
        entities=[entity1, entity2],
        schema=[
            Field(name="feature", dtype=Float32),
            Field(name="test_column_1", dtype=String),
            Field(name="test_column_2", dtype=String),
        ],
        source=file_source,
    )

    assert len(feature_view_1.schema) == 2
    assert len(feature_view_1.features) == 2

    # The entity field should be deleted from the schema and features of the feature view.
    update_feature_views_with_inferred_features([feature_view_1], [entity1],
                                                RepoConfig(provider="local",
                                                           project="test"))
    assert len(feature_view_1.schema) == 1
    assert len(feature_view_1.features) == 1

    assert len(feature_view_2.schema) == 3
    assert len(feature_view_2.features) == 3

    # The entity fields should be deleted from the schema and features of the feature view.
    update_feature_views_with_inferred_features(
        [feature_view_2],
        [entity1, entity2],
        RepoConfig(provider="local", project="test"),
    )
    assert len(feature_view_2.schema) == 1
    assert len(feature_view_2.features) == 1
Example #23
def batch_source(local_staging_path: str, pytestconfig,
                 request: FixtureRequest):
    if pytestconfig.getoption("env") == "gcloud":
        bq_project = pytestconfig.getoption("bq_project")
        bq_dataset = request.getfixturevalue("bq_dataset")
        return BigQuerySource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            table_ref=
            f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}",
        )
    else:
        return FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "transactions"),
        )
Example #24
 def create_data_source(
     self,
     destination: str,
     df: pd.DataFrame,
     event_timestamp_column="ts",
     created_timestamp_column="created_ts",
     field_mapping: Dict[str, str] = None,
 ) -> DataSource:
     self.f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
     df.to_parquet(self.f.name)
     return FileSource(
         file_format=ParquetFormat(),
         path=f"file://{self.f.name}",
         event_timestamp_column=event_timestamp_column,
         created_timestamp_column=created_timestamp_column,
         date_partition_column="",
         field_mapping=field_mapping or {"ts_1": "ts"},
     )
Example #25
def create_schema(kafka_broker, topic_name, feature_table_name):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name=feature_table_name,
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    return entity, feature_table
Example #26
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={
                "ts_1": "ts",
                "id": "driver_id"
            },
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv

            fs.teardown()
Example #27
def test_historical_feature_retrieval_with_field_mappings_from_local_spark_session(
    spark,
    client,
    driver_entity,
    bookings_feature_table_with_mapping,
):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
    ])
    df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc)),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc)),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc)),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "drivers", schema,
                                                  df_data)
    entity_source = FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=file_uri,
    )
    joined_df = client.get_historical_features_df(
        ["bookings:total_completed_bookings"],
        entity_source,
    )
    expected_joined_df_schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("bookings__total_completed_bookings", IntegerType()),
    ])
    expected_joined_df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc), 100),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc), 150),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc), None),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_df_data),
        expected_joined_df_schema,
    )
    assert_dataframe_equal(joined_df, expected_joined_df)
    shutil.rmtree(temp_dir)
Example #28
def bookings_feature_table(spark, client):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource("event_timestamp", "created_timestamp", "parquet",
                             file_uri)
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
Example #29
def test_apply_feature_view_integration(test_registry):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register Feature View
    test_registry.apply_feature_view(fv1, project)

    feature_views = test_registry.list_feature_views(project)

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == ValueType.INT64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == ValueType.STRING
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == ValueType.STRING_LIST
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_registry.get_feature_view("my_feature_view_1", project)
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == ValueType.INT64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == ValueType.STRING
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == ValueType.STRING_LIST
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == ValueType.BYTES_LIST
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_registry.delete_feature_view("my_feature_view_1", project)
    feature_views = test_registry.list_feature_views(project)
    assert len(feature_views) == 0

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example #30
        def evaluate_historical_retrieval():

            # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
            entity_df[entity_df_event_timestamp_col] = entity_df[
                entity_df_event_timestamp_col].apply(
                    lambda x: x
                    if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

            # Create a copy of entity_df to prevent modifying the original
            entity_df_with_features = entity_df.copy()

            # Convert event timestamp column to datetime and normalize time zone to UTC
            # This is necessary to avoid issues with pd.merge_asof
            entity_df_with_features[
                entity_df_event_timestamp_col] = pd.to_datetime(
                    entity_df_with_features[entity_df_event_timestamp_col],
                    utc=True)

            # Sort event timestamp values
            entity_df_with_features = entity_df_with_features.sort_values(
                entity_df_event_timestamp_col)

            # Load feature view data from sources and join them incrementally
            for feature_view, features in feature_views_to_features.items():
                event_timestamp_column = (
                    feature_view.batch_source.event_timestamp_column)
                created_timestamp_column = (
                    feature_view.batch_source.created_timestamp_column)

                # Read offline parquet data in pyarrow format.
                filesystem, path = FileSource.create_filesystem_and_path(
                    feature_view.batch_source.path,
                    feature_view.batch_source.file_options.
                    s3_endpoint_override,
                )
                table = pyarrow.parquet.read_table(path, filesystem=filesystem)

                # Rename columns by the field mapping dictionary if it exists
                if feature_view.batch_source.field_mapping is not None:
                    table = _run_field_mapping(
                        table, feature_view.batch_source.field_mapping)
                # Rename entity columns by the join_key_map dictionary if it exists
                if feature_view.projection.join_key_map:
                    table = _run_field_mapping(
                        table, feature_view.projection.join_key_map)

                # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
                # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean
                # If the dtype is 'object', then missing values are inferred as python `None`s.
                # More details at:
                # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
                df_to_join = table.to_pandas()

                # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
                df_to_join[event_timestamp_column] = df_to_join[
                    event_timestamp_column].apply(lambda x: x
                                                  if x.tzinfo is not None else
                                                  x.replace(tzinfo=pytz.utc))
                if created_timestamp_column:
                    df_to_join[created_timestamp_column] = df_to_join[
                        created_timestamp_column].apply(
                            lambda x: x if x.tzinfo is not None else x.replace(
                                tzinfo=pytz.utc))

                # Sort dataframe by the event timestamp column
                df_to_join = df_to_join.sort_values(event_timestamp_column)

                # Build a list of all the features we should select from this source
                feature_names = []
                for feature in features:
                    # Modify the separator for feature refs in column names to double underscore. We are using
                    # double underscore as separator for consistency with other databases like BigQuery,
                    # where there are very few characters available for use as separators
                    if full_feature_names:
                        formatted_feature_name = (
                            f"{feature_view.projection.name_to_use()}__{feature}"
                        )
                    else:
                        formatted_feature_name = feature
                    # Add the feature name to the list of columns
                    feature_names.append(formatted_feature_name)

                    # Ensure that the source dataframe feature column includes the feature view name as a prefix
                    df_to_join.rename(
                        columns={feature: formatted_feature_name},
                        inplace=True,
                    )

                # Build a list of entity columns to join on (from the right table)
                join_keys = []
                for entity_name in feature_view.entities:
                    entity = registry.get_entity(entity_name, project)
                    join_key = feature_view.projection.join_key_map.get(
                        entity.join_key, entity.join_key)
                    join_keys.append(join_key)
                right_entity_columns = join_keys
                right_entity_key_columns = [event_timestamp_column
                                            ] + right_entity_columns

                # Remove all duplicate entity keys (using created timestamp)
                right_entity_key_sort_columns = right_entity_key_columns
                if created_timestamp_column:
                    # If created_timestamp is available, use it to dedupe deterministically
                    right_entity_key_sort_columns = right_entity_key_sort_columns + [
                        created_timestamp_column
                    ]

                df_to_join.sort_values(by=right_entity_key_sort_columns,
                                       inplace=True)
                df_to_join.drop_duplicates(
                    right_entity_key_sort_columns,
                    keep="last",
                    ignore_index=True,
                    inplace=True,
                )

                # Select only the columns we need to join from the feature dataframe
                df_to_join = df_to_join[right_entity_key_columns +
                                        feature_names]

                # Do point in-time-join between entity_df and feature dataframe
                entity_df_with_features = pd.merge_asof(
                    entity_df_with_features,
                    df_to_join,
                    left_on=entity_df_event_timestamp_col,
                    right_on=event_timestamp_column,
                    by=right_entity_columns or None,
                    tolerance=feature_view.ttl,
                )

                # Remove right (feature table/view) event_timestamp column.
                if event_timestamp_column != entity_df_event_timestamp_col:
                    entity_df_with_features.drop(
                        columns=[event_timestamp_column], inplace=True)

                # Ensure that we delete dataframes to free up memory
                del df_to_join

            # Move "event_timestamp" column to front
            current_cols = entity_df_with_features.columns.tolist()
            current_cols.remove(entity_df_event_timestamp_col)
            entity_df_with_features = entity_df_with_features[
                [entity_df_event_timestamp_col] + current_cols]

            return entity_df_with_features
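The join at the center of the routine above is pd.merge_asof keyed by the entity columns, with the feature view's ttl as the match tolerance. Here is a self-contained sketch of that point-in-time join outside Feast (illustrative column names and values):

import pandas as pd

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1001],
        "event_timestamp": pd.to_datetime(
            ["2021-04-12 10:00", "2021-04-12 16:00"], utc=True
        ),
    }
)
feature_df = pd.DataFrame(
    {
        "driver_id": [1001, 1001],
        "event_timestamp": pd.to_datetime(
            ["2021-04-12 09:00", "2021-04-12 15:00"], utc=True
        ),
        "driver_hourly_stats__conv_rate": [0.1, 0.5],
    }
)

# Both frames must be sorted by the "on" column before merge_asof.
joined = pd.merge_asof(
    entity_df.sort_values("event_timestamp"),
    feature_df.sort_values("event_timestamp"),
    on="event_timestamp",
    by="driver_id",
    tolerance=pd.Timedelta(hours=2),  # plays the role of feature_view.ttl
)
print(joined)  # 10:00 row picks up the 09:00 value; 16:00 row picks up 15:00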