Example #1
def test_partial() -> None:
    """
    Add another table to an existing repo using the partial apply API. Make sure both
    the table applied via CLI apply and the new table pass the RW test.
    """

    runner = CliRunner()
    with runner.local_repo(get_example_repo("example_feature_repo_1.py"),
                           "bigquery") as store:

        driver_locations_source = BigQuerySource(
            table="feast-oss.public.drivers",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        driver_locations_100 = FeatureView(
            name="driver_locations_100",
            entities=["driver"],
            ttl=timedelta(days=1),
            schema=[
                Field(name="lat", dtype=Float32),
                Field(name="lon", dtype=String),
                Field(name="name", dtype=String),
            ],
            online=True,
            batch_source=driver_locations_source,
            tags={},
        )

        store.apply([driver_locations_100])

        basic_rw_test(store, view_name="driver_locations")
        basic_rw_test(store, view_name="driver_locations_100")
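Note: this test uses the newer Feast argument names (table/timestamp_field on BigQuerySource, schema/Field/batch_source on FeatureView), while several later examples (e.g. #8, #15) use the legacy ones. A minimal sketch of the mapping between the two styles, inferred from these examples rather than tied to any one Feast release:

from datetime import timedelta

from feast import BigQuerySource, FeatureView, Field
from feast.types import Float32

new_style_source = BigQuerySource(
    table="feast-oss.public.drivers",    # legacy: table_ref=...
    timestamp_field="event_timestamp",   # legacy: event_timestamp_column=...
)

new_style_view = FeatureView(
    name="driver_locations_new",
    entities=["driver"],
    ttl=timedelta(days=1),                      # legacy: ttl=Duration(seconds=86400)
    schema=[Field(name="lat", dtype=Float32)],  # legacy: features=[Feature(name="lat", dtype=ValueType.FLOAT)]
    batch_source=new_style_source,              # legacy: input=...
)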
Example #2
def test_get_column_names_preserves_feature_ordering():
    entity = Entity("my-entity",
                    description="My entity",
                    value_type=ValueType.STRING)
    fv = FeatureView(
        name="my-fv",
        entities=["my-entity"],
        ttl=timedelta(days=1),
        batch_source=BigQuerySource(table="non-existent-mock"),
        schema=[
            Field(name="a", dtype=String),
            Field(name="b", dtype=String),
            Field(name="c", dtype=String),
            Field(name="d", dtype=String),
            Field(name="e", dtype=String),
            Field(name="f", dtype=String),
            Field(name="g", dtype=String),
            Field(name="h", dtype=String),
            Field(name="i", dtype=String),
            Field(name="j", dtype=String),
        ],
    )

    _, feature_list, _, _ = _get_column_names(fv, [entity])
    assert feature_list == ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
Example #3
def stage_entities_to_bq(entity_source: pd.DataFrame, project: str,
                         dataset: str) -> BigQuerySource:
    """
    Stores the given (entity) dataframe as a new table in BigQuery. The table name is
    generated from the current time, and the table expires in 1 day.
    Returns a BigQuerySource referencing the created table.
    """
    bq_client = bigquery.Client()
    destination = bigquery.TableReference(
        bigquery.DatasetReference(project, dataset),
        f"_entities_{datetime.now():%Y%m%d%H%M%s}",
    )

    # prevent casting ns -> ms exception inside pyarrow
    entity_source["event_timestamp"] = entity_source[
        "event_timestamp"].dt.floor("ms")

    load_job: bigquery.LoadJob = bq_client.load_table_from_dataframe(
        entity_source, destination)
    load_job.result()  # wait until complete

    dest_table: bigquery.Table = bq_client.get_table(destination)
    dest_table.expires = datetime.now() + timedelta(days=1)
    bq_client.update_table(dest_table, fields=["expires"])

    return BigQuerySource(
        event_timestamp_column="event_timestamp",
        table_ref=f"{destination.project}:{destination.dataset_id}.{destination.table_id}",
    )
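A hypothetical usage sketch for stage_entities_to_bq (the project and dataset names below are placeholders, not values from the original code):

import pandas as pd

entities = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": pd.to_datetime(["2021-06-01 00:00:00", "2021-06-01 01:00:00"]),
})
source = stage_entities_to_bq(entities, project="my-gcp-project", dataset="my_dataset")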
Example #4
    def create_data_source(
        self,
        destination: str,
        df: pd.DataFrame,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        field_mapping: Optional[Dict[str, str]] = None,
        **kwargs,
    ) -> DataSource:

        job_config = bigquery.LoadJobConfig()
        if self.gcp_project not in destination:
            destination = f"{self.gcp_project}.{self.project_name}.{destination}"

        job = self.client.load_table_from_dataframe(df,
                                                    destination,
                                                    job_config=job_config)
        job.result()

        self.tables.append(destination)

        return BigQuerySource(
            table_ref=destination,
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            date_partition_column="",
            field_mapping=field_mapping or {"ts_1": "ts"},
        )
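Hypothetical usage, assuming `creator` is an instance of the data source creator class this method belongs to (Feast's test suites use one such creator per offline store):

import pandas as pd

df = pd.DataFrame({
    "driver_id": [1001],
    "value": [1.0],
    "ts": pd.to_datetime(["2021-06-01"]),
    "created_ts": pd.to_datetime(["2021-06-01"]),
})
source = creator.create_data_source(destination="driver_stats", df=df)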
Example #5
    def create_data_source(
        self,
        df: pd.DataFrame,
        destination_name: Optional[str] = None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        field_mapping: Optional[Dict[str, str]] = None,
        **kwargs,
    ) -> DataSource:

        destination_name = self.get_prefixed_table_name(destination_name)

        self.create_dataset()

        if self.gcp_project not in destination_name:
            destination_name = (
                f"{self.gcp_project}.{self.project_name}.{destination_name}")

        job = self.client.load_table_from_dataframe(df, destination_name)
        job.result()

        self.tables.append(destination_name)

        return BigQuerySource(
            table_ref=destination_name,
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            date_partition_column="",
            field_mapping=field_mapping or {"ts_1": "ts"},
        )
Example #6
def create_bq_view_of_joined_features_and_entities(
        source: BigQuerySource, entity_source: BigQuerySource,
        entity_names: List[str]) -> BigQuerySource:
    """
    Creates a BigQuery view that joins the tables behind `source` and `entity_source`,
    with the join condition derived from `entity_names`.
    Returns a BigQuerySource referencing the created view.
    """
    bq_client = bigquery.Client()

    source_ref = table_reference_from_string(source.bigquery_options.table_ref)
    entities_ref = table_reference_from_string(
        entity_source.bigquery_options.table_ref)

    destination_ref = bigquery.TableReference(
        bigquery.DatasetReference(source_ref.project, source_ref.dataset_id),
        f"_view_{source_ref.table_id}_{datetime.now():%Y%m%d%H%M%s}",
    )

    view = bigquery.Table(destination_ref)
    view.view_query = JOIN_TEMPLATE.format(
        entities=entities_ref,
        source=source_ref,
        entity_key=" AND ".join(
            [f"source.{e} = entities.{e}" for e in entity_names]),
    )
    view.expires = datetime.now() + timedelta(days=1)
    bq_client.create_table(view)

    return BigQuerySource(
        event_timestamp_column=source.event_timestamp_column,
        created_timestamp_column=source.created_timestamp_column,
        table_ref=f"{view.project}:{view.dataset_id}.{view.table_id}",
        field_mapping=source.field_mapping,
        date_partition_column=source.date_partition_column,
    )
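JOIN_TEMPLATE is referenced above but not shown in this excerpt. A plausible definition, assuming only the placeholder names used in the .format() call (attribute-style placeholders work with the TableReference objects passed in):

JOIN_TEMPLATE = """SELECT source.*
FROM `{source.project}.{source.dataset_id}.{source.table_id}` source
JOIN `{entities.project}.{entities.dataset_id}.{entities.table_id}` entities
ON {entity_key}"""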
Example #7
def simple_bq_source_using_query_arg(df, event_timestamp_column=None) -> BigQuerySource:
    bq_source_using_table = simple_bq_source_using_table_arg(df, event_timestamp_column)
    return BigQuerySource(
        name=bq_source_using_table.table,
        query=f"SELECT * FROM {bq_source_using_table.table}",
        timestamp_field=event_timestamp_column,
    )
Example #8
    def test_basic(self) -> None:
        """
            Add another table to an existing repo using the partial apply API. Make sure both
            the table applied via CLI apply and the new table pass the RW test.
        """

        runner = CliRunner()
        with runner.local_repo(get_example_repo("example_feature_repo_1.py")) as store:

            driver_locations_source = BigQuerySource(
                table_ref="rh_prod.ride_hailing_co.drivers",
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created_timestamp",
            )

            driver_locations_100 = FeatureView(
                name="driver_locations_100",
                entities=["driver"],
                ttl=Duration(seconds=86400 * 1),
                features=[
                    Feature(name="lat", dtype=ValueType.FLOAT),
                    Feature(name="lon", dtype=ValueType.STRING),
                    Feature(name="name", dtype=ValueType.STRING),
                ],
                online=True,
                input=driver_locations_source,
                tags={},
            )

            store.apply([driver_locations_100])

            basic_rw_test(store, view_name="driver_locations")
            basic_rw_test(store, view_name="driver_locations_100")
Example #9
def simple_bq_source_using_query_arg(df, event_timestamp_column=None) -> BigQuerySource:
    bq_source_using_table_ref = simple_bq_source_using_table_ref_arg(
        df, event_timestamp_column
    )
    return BigQuerySource(
        query=f"SELECT * FROM {bq_source_using_table_ref.table_ref}",
        event_timestamp_column=event_timestamp_column,
    )
Example #10
def prep_bq_fs_and_fv(bq_source_type: str) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = 1000 * 60 * 60 * 24 * 14  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    df = create_dataset()

    job_config = bigquery.LoadJobConfig()
    table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}"
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df,
                                           table_ref,
                                           job_config=job_config)
    job.result()

    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(
                namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()
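prep_bq_fs_and_fv is a generator: it yields a (FeatureStore, FeatureView) pair and runs fs.teardown() once the consumer is done. A hypothetical sketch of wrapping it as a parametrized pytest fixture (the fixture name is illustrative, not from the original suite):

import pytest

@pytest.fixture(params=["table", "query"])
def bq_fs_and_fv(request):
    # Delegate setup and teardown to the generator above.
    yield from prep_bq_fs_and_fv(request.param)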
Example #11
def simple_bq_source_using_table_arg(df, event_timestamp_column=None) -> BigQuerySource:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = f"ds_{time.time_ns()}"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000
        * 60
        * 60  # 60 minutes in milliseconds (seems to be minimum limit for gcloud)
    )
    client.update_dataset(dataset, ["default_table_expiration_ms"])
    table = f"{gcp_project}.{bigquery_dataset}.table_{random.randrange(100, 999)}"

    job = client.load_table_from_dataframe(df, table)
    job.result()

    return BigQuerySource(table=table, timestamp_field=event_timestamp_column)
Example #12
def batch_source(local_staging_path: str, pytestconfig,
                 request: FixtureRequest):
    if pytestconfig.getoption("env") == "gcloud":
        bq_project = pytestconfig.getoption("bq_project")
        bq_dataset = request.getfixturevalue("bq_dataset")
        return BigQuerySource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            table_ref=f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}",
        )
    else:
        return FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "transactions"),
        )
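The table_ref above uses the legacy colon-separated BigQuery form, project:dataset.table. A hypothetical helper (similar in spirit to the table_reference_from_string call in Example #6, which is not shown in these excerpts) to parse that form:

from google.cloud import bigquery

def table_reference_from_colon_string(ref: str) -> bigquery.TableReference:
    project, rest = ref.split(":", 1)
    dataset_id, table_id = rest.split(".", 1)
    return bigquery.TableReference(bigquery.DatasetReference(project, dataset_id), table_id)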
Example #13
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset,
                                        feast_client: Client,
                                        feast_spark_client: SparkClient):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id",
                    description="S2id",
                    value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Example #14
def simple_bq_source_using_table_ref_arg(
    df, event_timestamp_column=None
) -> BigQuerySource:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "ds"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000
        * 60
        * 60  # 60 minutes in milliseconds (seems to be minimum limit for gcloud)
    )
    client.update_dataset(dataset, ["default_table_expiration_ms"])
    table_ref = f"{gcp_project}.{bigquery_dataset}.table_1"

    job = client.load_table_from_dataframe(
        df, table_ref, job_config=bigquery.LoadJobConfig()
    )
    job.result()

    return BigQuerySource(
        table_ref=table_ref, event_timestamp_column=event_timestamp_column,
    )
Example #15
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

driver = Entity(name="driver_id", join_key="driver_id", value_type=ValueType.INT64,)

driver_stats_source = BigQuerySource(
    table_ref="feast-oss.demo_data.driver_hourly_stats",
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

driver_stats_fv = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=timedelta(weeks=52),
    features=[
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
        Feature(name="avg_daily_trips", dtype=ValueType.INT64),
    ],
    input=driver_stats_source,
    tags={"team": "driver_performance"},
)
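A minimal sketch of registering the definitions above, assuming this file sits inside an initialized Feast repo (the CLI equivalent is `feast apply` run from the repo root):

from feast import FeatureStore

store = FeatureStore(repo_path=".")
store.apply([driver, driver_stats_fv])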
Example #16
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

nonexistent_source = BigQuerySource(
    table_ref="project.dataset.nonexistent_table", event_timestamp_column=""
)

driver = Entity(name="driver", value_type=ValueType.INT64, description="driver id",)

nonexistent_features = FeatureView(
    name="driver_locations",
    entities=["driver"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="lat", dtype=ValueType.FLOAT),
        Feature(name="lon", dtype=ValueType.STRING),
    ],
    input=nonexistent_source,
)
Example #17
from datetime import timedelta

from feast import (
    BigQuerySource,
    Entity,
    Feature,
    FeatureService,
    FeatureView,
    ValueType,
)

driver_locations_source = BigQuerySource(
    table_ref="feast-oss.public.drivers",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

customer_profile_source = BigQuerySource(
    table_ref="feast-oss.public.customers", event_timestamp_column="event_timestamp",
)

customer_driver_combined_source = BigQuerySource(
    table_ref="feast-oss.public.customer_driver",
    event_timestamp_column="event_timestamp",
)

driver = Entity(
    name="driver",  # The name is derived from this argument, not object name.
    value_type=ValueType.INT64,
    description="driver id",
)
Example #18
    name="driver",
    # The join keys of an entity describe the storage level field/column on which
    # features can be looked up. The join keys are also used to join feature
    # tables/views when building feature vectors
    join_keys=["driver_id"],
    # The storage level type for an entity
    value_type=ValueType.INT64,
)

# Indicates a data source from which feature values can be retrieved. Sources are queried when building training
# datasets or materializing features into an online store.
driver_stats_source = BigQuerySource(
    # The BigQuery table where features can be found
    table="feast-oss.demo_data.driver_hourly_stats_2",
    # The event timestamp is used for point-in-time joins and for ensuring only
    # features within the TTL are returned
    timestamp_field="event_timestamp",
    # The (optional) created timestamp is used to ensure there are no duplicate
    # feature rows in the offline store or when building training datasets
    created_timestamp_column="created",
)

# Feature views are a grouping based on how features are stored in either the
# online or offline store.
driver_stats_fv = FeatureView(
    # The unique name of this feature view. Two feature views in a single
    # project cannot have the same name
    name="driver_hourly_stats",
    # The list of entities specifies the keys required for joining or looking
    # up features from this feature view. The reference provided in this field
    # corresponds to the name of a defined entity (or entities)
    entities=["driver"],
Example #19
def test_historical_features_from_bigquery_sources(provider_type,
                                                   infer_event_timestamp_col,
                                                   capsys, full_feature_names):
    start_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    (
        customer_entities,
        driver_entities,
        end_date,
        orders_df,
        start_date,
    ) = generate_entities(start_date, infer_event_timestamp_col)

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Orders Query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(orders_df, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_df = driver_data.create_driver_hourly_stats_df(
            driver_entities, start_date, end_date)
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_df, driver_table_id)
        driver_source = BigQuerySource(
            table_ref=driver_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        driver_fv = create_driver_hourly_stats_feature_view(driver_source)

        # Customer Feature View
        customer_df = driver_data.create_customer_daily_profile_df(
            customer_entities, start_date, end_date)
        customer_table_id = f"{gcp_project}.{bigquery_dataset}.customer_profile"

        stage_customer_daily_profile_bigquery_source(customer_df,
                                                     customer_table_id)
        customer_source = BigQuerySource(
            table_ref=customer_table_id,
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created",
        )
        customer_fv = create_customer_daily_profile_feature_view(
            customer_source)

        driver = Entity(name="driver",
                        join_key="driver_id",
                        value_type=ValueType.INT64)
        customer = Entity(name="customer_id", value_type=ValueType.INT64)

        if provider_type == "local":
            store = FeatureStore(config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="default",
                provider="local",
                online_store=SqliteOnlineStoreConfig(path=os.path.join(
                    temp_dir, "online_store.db"), ),
                offline_store=BigQueryOfflineStoreConfig(
                    type="bigquery", dataset=bigquery_dataset),
            ))
        elif provider_type == "gcp":
            store = FeatureStore(config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="".join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=10)),
                provider="gcp",
                offline_store=BigQueryOfflineStoreConfig(
                    type="bigquery", dataset=bigquery_dataset),
            ))
        elif provider_type == "gcp_custom_offline_config":
            store = FeatureStore(config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project="".join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=10)),
                provider="gcp",
                offline_store=BigQueryOfflineStoreConfig(type="bigquery",
                                                         dataset="foo"),
            ))
        else:
            raise Exception(
                "Invalid provider used as part of test configuration")

        store.apply([driver, customer, driver_fv, customer_fv])

        try:
            event_timestamp = (DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                               if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
                               in orders_df.columns else "e_ts")
            expected_df = get_expected_training_df(
                customer_df,
                customer_fv,
                driver_df,
                driver_fv,
                orders_df,
                event_timestamp,
                full_feature_names,
            )

            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns)
            assert_frame_equal(
                expected_df.sort_values(by=[
                    event_timestamp, "order_id", "driver_id", "customer_id"
                ]).reset_index(drop=True),
                actual_df_from_sql_entities[expected_df.columns].sort_values(
                    by=[
                        event_timestamp, "order_id", "driver_id", "customer_id"
                    ]).reset_index(drop=True),
                check_dtype=False,
            )

            table_from_sql_entities = job_from_sql.to_arrow()
            assert_frame_equal(actual_df_from_sql_entities,
                               table_from_sql_entities.to_pandas())

            timestamp_column = ("e_ts" if infer_event_timestamp_col else
                                DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL)

            entity_df_query_with_invalid_join_key = (
                f"select order_id, driver_id, customer_id as customer, "
                f"order_is_success, {timestamp_column} FROM {gcp_project}.{table_id}"
            )
            # Rename the join key; this should now raise an error.
            assertpy.assert_that(store.get_historical_features).raises(
                errors.FeastEntityDFMissingColumnsError).when_called_with(
                    entity_df=entity_df_query_with_invalid_join_key,
                    features=[
                        "driver_stats:conv_rate",
                        "driver_stats:avg_daily_trips",
                        "customer_profile:current_balance",
                        "customer_profile:avg_passenger_count",
                        "customer_profile:lifetime_trip_count",
                    ],
                )

            job_from_df = store.get_historical_features(
                entity_df=orders_df,
                features=[
                    "driver_stats:conv_rate",
                    "driver_stats:avg_daily_trips",
                    "customer_profile:current_balance",
                    "customer_profile:avg_passenger_count",
                    "customer_profile:lifetime_trip_count",
                ],
                full_feature_names=full_feature_names,
            )

            # Rename the join key; this should now raise an error.
            orders_df_with_invalid_join_key = orders_df.rename(
                {"customer_id": "customer"}, axis="columns")
            assertpy.assert_that(store.get_historical_features).raises(
                errors.FeastEntityDFMissingColumnsError).when_called_with(
                    entity_df=orders_df_with_invalid_join_key,
                    features=[
                        "driver_stats:conv_rate",
                        "driver_stats:avg_daily_trips",
                        "customer_profile:current_balance",
                        "customer_profile:avg_passenger_count",
                        "customer_profile:lifetime_trip_count",
                    ],
                )

            # Make sure that custom dataset name is being used from the offline_store config
            if provider_type == "gcp_custom_offline_config":
                assertpy.assert_that(
                    job_from_df.query).contains("foo.feast_entity_df")
            else:
                assertpy.assert_that(job_from_df.query).contains(
                    f"{bigquery_dataset}.feast_entity_df")

            start_time = datetime.utcnow()
            actual_df_from_df_entities = job_from_df.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(f"Time to execute job_from_df.to_df() = '{end_time - start_time}'\n")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_df_entities.columns)
            assert_frame_equal(
                expected_df.sort_values(by=[
                    event_timestamp, "order_id", "driver_id", "customer_id"
                ]).reset_index(drop=True),
                actual_df_from_df_entities[expected_df.columns].sort_values(
                    by=[
                        event_timestamp, "order_id", "driver_id", "customer_id"
                    ]).reset_index(drop=True),
                check_dtype=False,
            )

            table_from_df_entities = job_from_df.to_arrow()
            assert_frame_equal(actual_df_from_df_entities,
                               table_from_df_entities.to_pandas())
        finally:
            store.teardown()
Example #20
    # features can be looked up. The join key is also used to join feature
    # tables/views when building feature vectors
    join_key="CustomerID",
    # The storage level type for an entity
    value_type=ValueType.INT64,
)

# Indicates a data source from which feature values can be retrieved. Sources are queried when building training
# datasets or materializing features into an online store.
transaction_stats = BigQuerySource(
    # The BigQuery table where features can be found
    table_ref="srivatsan-project.customer.transactions",
    # The event timestamp is used for point-in-time joins and for ensuring only
    # features within the TTL are returned
    event_timestamp_column="event_timestamp",
    # The (optional) created timestamp is used to ensure there are no duplicate
    # feature rows in the offline store or when building training datasets
    created_timestamp_column="created_timestamp",
)

#transaction_stats = FileSource(
#    path="/home/jupyter/transactions.parquet",
#    event_timestamp_column="event_timestamp",
#    created_timestamp_column="created_timestamp",
#)

# Feature views are a grouping based on how features are stored in either the
# online or offline store.
transaction_stats_fv = FeatureView(
    # The unique name of this feature view. Two feature views in a single
Example #21
from datetime import timedelta

from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType

driver_locations_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.drivers",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created_timestamp",
)

customer_profile_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.customers",
    event_timestamp_column="event_timestamp",
)

customer_driver_combined_source = BigQuerySource(
    table_ref="rh_prod.ride_hailing_co.customer_driver",
    event_timestamp_column="event_timestamp",
)

driver = Entity(
    name="driver",  # The name is derived from this argument, not object name.
    value_type=ValueType.INT64,
    description="driver id",
)

customer = Entity(
    name="customer",  # The name is derived from this argument, not object name.
    value_type=ValueType.STRING,
)
Example #22
from datetime import timedelta

from feast import (
    BigQuerySource,
    Entity,
    FeatureService,
    FeatureView,
    Field,
    PushSource,
    ValueType,
)
from feast.types import Float32, Int64, String

driver_locations_source = BigQuerySource(
    table="feast-oss.public.drivers",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

driver_locations_source_query = BigQuerySource(
    query="SELECT * from feast-oss.public.drivers",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

driver_locations_source_query_2 = BigQuerySource(
    query="SELECT lat * 2 FROM feast-oss.public.drivers",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)
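All three sources above should be usable interchangeably downstream. A hedged sketch of one way to inspect what they resolve to, assuming BigQuerySource exposes get_table_query_string() in the Feast versions these examples target (returning a backticked table name for table-backed sources and a parenthesized query for query-backed ones):

print(driver_locations_source.get_table_query_string())        # `feast-oss.public.drivers`
print(driver_locations_source_query.get_table_query_string())  # (SELECT * from feast-oss.public.drivers)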
Example #23
def test_historical_features_from_bigquery_sources_containing_backfills(
        capsys):
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    tomorrow = now + timedelta(days=1)

    entity_dataframe = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": now + timedelta(days=2)
        },
        {
            "driver_id": 1002,
            "event_timestamp": now + timedelta(days=2)
        },
    ])

    driver_stats_df = pd.DataFrame(data=[
        # Duplicated rows simple case
        {
            "driver_id": 1001,
            "avg_daily_trips": 10,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1001,
            "avg_daily_trips": 20,
            "event_timestamp": tomorrow,
            "created": tomorrow,
        },
        # Duplicated rows after a backfill
        {
            "driver_id": 1002,
            "avg_daily_trips": 30,
            "event_timestamp": now,
            "created": tomorrow,
        },
        {
            "driver_id": 1002,
            "avg_daily_trips": 40,
            "event_timestamp": tomorrow,
            "created": now,
        },
    ])

    expected_df = pd.DataFrame(data=[
        {
            "driver_id": 1001,
            "event_timestamp": now + timedelta(days=2),
            "avg_daily_trips": 20,
        },
        {
            "driver_id": 1002,
            "event_timestamp": now + timedelta(days=2),
            "avg_daily_trips": 40,
        },
    ])

    bigquery_dataset = (
        f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    )

    with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir:
        gcp_project = bigquery.Client().project

        # Entity Dataframe SQL query
        table_id = f"{bigquery_dataset}.orders"
        stage_orders_bigquery(entity_dataframe, table_id)
        entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}"

        # Driver Feature View
        driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly"
        stage_driver_hourly_stats_bigquery_source(driver_stats_df,
                                                  driver_table_id)

        store = FeatureStore(config=RepoConfig(
            registry=os.path.join(temp_dir, "registry.db"),
            project="".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)),
            provider="gcp",
            offline_store=BigQueryOfflineStoreConfig(type="bigquery",
                                                     dataset=bigquery_dataset),
        ))

        driver = Entity(name="driver",
                        join_key="driver_id",
                        value_type=ValueType.INT64)
        driver_fv = FeatureView(
            name="driver_stats",
            entities=["driver"],
            features=[Feature(name="avg_daily_trips", dtype=ValueType.INT32)],
            batch_source=BigQuerySource(
                table_ref=driver_table_id,
                event_timestamp_column="event_timestamp",
                created_timestamp_column="created",
            ),
            ttl=None,
        )

        store.apply([driver, driver_fv])

        try:
            job_from_sql = store.get_historical_features(
                entity_df=entity_df_query,
                features=["driver_stats:avg_daily_trips"],
                full_feature_names=False,
            )

            start_time = datetime.utcnow()
            actual_df_from_sql_entities = job_from_sql.to_df()
            end_time = datetime.utcnow()
            with capsys.disabled():
                print(f"\nTime to execute job_from_sql.to_df() = '{end_time - start_time}'")

            assert sorted(expected_df.columns) == sorted(
                actual_df_from_sql_entities.columns)
            assert_frame_equal(
                expected_df.sort_values(by=["driver_id"]).reset_index(
                    drop=True),
                actual_df_from_sql_entities[expected_df.columns].sort_values(
                    by=["driver_id"]).reset_index(drop=True),
                check_dtype=False,
            )

        finally:
            store.teardown()
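For reference, the deduplication rule this test exercises can be sketched in plain pandas (a toy illustration, not Feast internals): for each driver, keep the row with the latest event_timestamp at or before the entity timestamp, breaking ties by the latest created timestamp. Here every feature row precedes the entity timestamps (now + 2 days), so the timestamp filter is vacuous:

latest = (
    driver_stats_df.sort_values(["event_timestamp", "created"])
    .groupby("driver_id")
    .tail(1)
)
# latest now carries the avg_daily_trips values expected_df asserts (20 and 40).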