Exemple #1
0
def get_fixtures(request):
    config: TypeTestConfig = request.param
    # Lower case needed because Redshift lower-cases all table names
    test_project_id = f"{config.entity_type}{config.feature_dtype}{config.feature_is_list}".replace(
        ".", "").lower()
    type_test_environment = construct_test_environment(
        test_repo_config=config.test_repo_config,
        test_suite_name=f"test_{test_project_id}",
    )
    config = request.param
    df = create_dataset(
        config.entity_type,
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
    )
    data_source = type_test_environment.data_source_creator.create_data_source(
        df,
        destination_name=type_test_environment.feature_store.project,
        field_mapping={"ts_1": "ts"},
    )
    fv = create_feature_view(
        config.feature_dtype,
        config.feature_is_list,
        config.has_empty_list,
        data_source,
    )

    def cleanup():
        type_test_environment.data_source_creator.teardown()
        type_test_environment.feature_store.teardown()

    request.addfinalizer(cleanup)

    return type_test_environment, config, data_source, fv
def prep_bq_fs_and_fv(
    bq_source_type: str, ) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (1000 * 60 * 60 * 24 * 14
                                           )  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    df = create_dataset()

    job_config = bigquery.LoadJobConfig()
    table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}"
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df,
                                           table_ref,
                                           job_config=job_config)
    job.result()

    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(bigquery_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="gcp",
            online_store=DatastoreOnlineStoreConfig(
                namespace="integration_test"),
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()
Exemple #3
0
def e2e_data_sources(environment: Environment):
    df = create_dataset()
    data_source = environment.data_source_creator.create_data_source(
        df,
        environment.feature_store.project,
        field_mapping={"ts_1": "ts"},
    )

    yield df, data_source

    environment.data_source_creator.teardown()
Exemple #4
0
def e2e_data_sources(environment: Environment, request):
    df = create_dataset()
    data_source = environment.data_source_creator.create_data_source(
        df,
        environment.feature_store.project,
        field_mapping={"ts_1": "ts"},
    )

    def cleanup():
        environment.data_source_creator.teardown()

    request.addfinalizer(cleanup)

    return df, data_source
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={
                "ts_1": "ts",
                "id": "driver_id"
            },
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv

            fs.teardown()
def prep_redshift_fs_and_fv(
    source_type: str, ) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location=
        "s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}"
        if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory(
    ) as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

    # Clean up the uploaded Redshift table
    aws_utils.execute_redshift_statement(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        f"DROP TABLE {table_name}",
    )
Exemple #7
0
def construct_test_environment(
    test_repo_config: TestRepoConfig,
    create_and_apply: bool = False,
    materialize: bool = False,
) -> Environment:
    """
    This method should take in the parameters from the test repo config and created a feature repo, apply it,
    and return the constructed feature store object to callers.

    This feature store object can be interacted for the purposes of tests.
    The user is *not* expected to perform any clean up actions.

    :param test_repo_config: configuration
    :return: A feature store built using the supplied configuration.
    """
    df = create_dataset()

    project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"

    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
        ".", 1)

    offline_creator: DataSourceCreator = importer.get_class_from_type(
        module_name, config_class_name, "DataSourceCreator")(project)
    ds = offline_creator.create_data_source(project,
                                            df,
                                            field_mapping={
                                                "ts_1": "ts",
                                                "id": "driver_id"
                                            })
    offline_store = offline_creator.create_offline_store_config()
    online_store = test_repo_config.online_store

    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=project,
            provider=test_repo_config.provider,
            offline_store=offline_store,
            online_store=online_store,
            repo_path=repo_dir_name,
        )
        fs = FeatureStore(config=config)
        environment = Environment(
            name=project,
            test_repo_config=test_repo_config,
            feature_store=fs,
            data_source=ds,
            data_source_creator=offline_creator,
        )

        fvs = []
        entities = []
        try:
            if create_and_apply:
                entities.extend([driver(), customer()])
                fvs.extend([
                    environment.driver_stats_feature_view(),
                    environment.customer_feature_view(),
                ])
                fs.apply(fvs + entities)

            if materialize:
                fs.materialize(environment.start_date, environment.end_date)

            yield environment
        finally:
            offline_creator.teardown()
            fs.teardown()