Code example #1
def feast_client(pytestconfig, ingestion_job_jar):
    redis_host, redis_port = pytestconfig.getoption("redis_url").split(":")

    if pytestconfig.getoption("env") == "local":
        return Client(
            core_url=pytestconfig.getoption("core_url"),
            serving_url=pytestconfig.getoption("serving_url"),
            spark_launcher="standalone",
            spark_standalone_master="local",
            spark_home=os.getenv("SPARK_HOME") or os.path.dirname(pyspark.__file__),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=redis_host,
            redis_port=redis_port,
        )

    if pytestconfig.getoption("env") == "gcloud":
        return Client(
            core_url=pytestconfig.getoption("core_url"),
            serving_url=pytestconfig.getoption("serving_url"),
            spark_launcher="dataproc",
            dataproc_cluster_name=pytestconfig.getoption("dataproc_cluster_name"),
            dataproc_project=pytestconfig.getoption("dataproc_project"),
            dataproc_region=pytestconfig.getoption("dataproc_region"),
            dataproc_staging_location=os.path.join(
                pytestconfig.getoption("staging_path"), "dataproc"
            ),
            spark_ingestion_jar=ingestion_job_jar,
        )
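
This fixture-style helper reads its endpoints from pytest command-line options that are not defined in the excerpt. A minimal conftest.py sketch that would register them (option names inferred from the getoption() calls above, defaults are placeholders, not taken from the source) might look like:

# Hypothetical conftest.py sketch: registers the CLI options the helper above reads.
def pytest_addoption(parser):
    parser.addoption("--env", action="store", default="local")
    parser.addoption("--core-url", action="store", default="localhost:6565")
    parser.addoption("--serving-url", action="store", default="localhost:6566")
    parser.addoption("--redis-url", action="store", default="localhost:6379")
    parser.addoption("--dataproc-cluster-name", action="store")
    parser.addoption("--dataproc-project", action="store")
    parser.addoption("--dataproc-region", action="store")
    parser.addoption("--staging-path", action="store")
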
Code example #2
def test_restarting_failed_jobs(feature_table):
    """ If configured - restart failed jobs """

    feast_client = FeastClient(
        job_service_pause_between_jobs=0,
        job_service_retry_failed_jobs=True,
        options={"whitelisted_projects": "default,ride"},
    )
    feast_client.list_projects = Mock(return_value=["default"])
    feast_client.list_feature_tables = Mock()

    spark_client = Client(feast_client)
    spark_client.list_jobs = Mock()
    spark_client.start_stream_to_online_ingestion = Mock()

    spark_client.feature_store.list_feature_tables.return_value = [
        feature_table
    ]
    spark_client.list_jobs.return_value = []

    ensure_stream_ingestion_jobs(spark_client, all_projects=True)

    spark_client.list_jobs.assert_called_once_with(include_terminated=False)
    spark_client.start_stream_to_online_ingestion.assert_called_once_with(
        feature_table, [], project="default")
Code example #3
    def __init__(self, name: str,
                 predictor_host: str,
                 feast_serving_url: str,
                 entity_ids: List[str],
                 feature_refs: List[str]):
        """Initialize the model name, predictor host, Feast serving URL,
           entity IDs, and feature references

        Args:
            name (str): Name of the model.
            predictor_host (str): The host in which the predictor runs.
            feast_serving_url (str): The Feast serving URL, in the form
            of <host_name:port>
            entity_ids (List[str]): The entity IDs for which to retrieve
            features from the Feast feature store
            feature_refs (List[str]): The feature references for the
            features to be retrieved
        """
        super().__init__(name)
        self.predictor_host = predictor_host
        self.client = Client(serving_url=feast_serving_url)
        self.entity_ids = entity_ids
        self.feature_refs = feature_refs

        logging.info("Model name = %s", name)
        logging.info("Predictor host = %s", predictor_host)
        logging.info("Feast serving URL = %s", feast_serving_url)
        logging.info("Entity ids = %s", entity_ids)
        logging.info("Feature refs = %s", feature_refs)

        self.timeout = 100
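
For context, the stored client, entity IDs, and feature references would typically be used in a preprocess step that enriches incoming prediction requests with online features. The method below is only a sketch: it assumes a KFServing-style {"instances": [...]} request payload (an assumption, not shown in the snippet) and reuses the get_online_features(feature_refs, entity_rows) call shape seen in the other examples on this page.

    def preprocess(self, inputs: dict) -> dict:
        # Hypothetical: build one entity row per incoming instance.
        entity_rows = [
            {entity_id: instance[entity_id] for entity_id in self.entity_ids}
            for instance in inputs["instances"]
        ]
        # Fetch the requested features from Feast online serving.
        features = self.client.get_online_features(
            self.feature_refs, entity_rows=entity_rows
        ).to_dict()
        # Pass the retrieved feature values on to the predictor.
        return {"instances": [features]}
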
Code example #4
def ingest_and_verify(feast_client: Client, feature_table: FeatureTable,
                      original: pd.DataFrame):
    job = feast_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    assert job.get_feature_table() == feature_table.name

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
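
wait_retry_backoff is used throughout these examples but is not included in the excerpts; it polls a callable that returns a (result, is_done) tuple until completion or timeout. The stand-in below is an assumption about its behaviour, not the project's actual helper:

import time

def wait_retry_backoff(retry_fn, timeout_secs, max_interval_secs=10):
    """Poll retry_fn until it reports completion or the timeout expires."""
    deadline = time.time() + timeout_secs
    interval = 1
    while True:
        result, is_done = retry_fn()
        if is_done:
            return result
        if time.time() >= deadline:
            raise TimeoutError(f"Condition not met within {timeout_secs}s")
        time.sleep(interval)
        interval = min(interval * 2, max_interval_secs)  # exponential backoff, capped
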
Code example #5
    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(FeastExtractor.DEFAULT_CONFIG)
        self._feast_service = conf.get_string(
            FeastExtractor.FEAST_SERVICE_CONFIG_KEY)
        self._describe_feature_tables = conf.get_bool(
            FeastExtractor.DESCRIBE_FEATURE_TABLES)
        self._client = Client(
            core_url=conf.get_string(FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY))
        self._extract_iter: Union[None, Iterator] = None
Code example #6
def feast_client():
    c = FeastClient(
        job_service_pause_between_jobs=0,
        options={"whitelisted_projects": "default,ride"},
    )
    c.list_projects = Mock(return_value=["default", "ride", "invalid_project"])
    c.list_feature_tables = Mock()

    yield c
Code example #7
def ensure_stream_ingestion_jobs(client: feast.Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all project.
                             Otherwise only checks the client's current project.
    """

    projects = client.list_projects() if all_projects else [client.project]

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_table_refs.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    for job in client.list_jobs(include_terminated=False):
        if isinstance(job, StreamIngestionJob):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logging.debug(
        f"existing_job_hashes = {sorted(list(existing_job_hashes))} expected_job_hashes = {sorted(list(expected_job_hashes))}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be among expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.get_feature_table(name=table_name,
                                                 project=project)
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)
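
The core of this reconciliation is two set differences over job hashes. The toy example below, using plain dictionaries in place of Feast clients and jobs (all names illustrative), shows the same bookkeeping:

# expected: hash -> (project, table) derived from feature table definitions
expected = {"hash-a": ("default", "drivers"), "hash-b": ("ride", "bookings")}
# existing: hash -> id of a currently running stream ingestion job
existing = {"hash-b": "job-2", "hash-c": "job-3"}

to_cancel = set(existing) - set(expected)   # {"hash-c"}: running but no longer wanted
to_start = set(expected) - set(existing)    # {"hash-a"}: wanted but not running

for job_hash in to_cancel:
    print(f"cancel job {existing[job_hash]}")
for job_hash in to_start:
    project, table = expected[job_hash]
    print(f"start ingestion for {project}/{table}")
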
Code example #8
def test_schedule_batch_ingestion_jobs(pytestconfig, feast_client: Client,
                                       feast_spark_client: SparkClient):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *")
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(f"{feast_client.project}-{feature_table.name}".
                               encode()).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"
    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *")
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
Code example #9
File: job_service.py, Project: vingovan/feast
def _get_expected_job_hash_to_table_refs(
        client: feast.Client,
        projects: List[str]) -> Dict[str, Tuple[str, str]]:
    """
    Checks all feature tables for the requires project(s) and determines all required stream
    ingestion jobs from them. Outputs a map of the expected job_hash to a tuple of (project, table_name).

    Args:
        all_projects (bool): If true, runs the check for all project.
            Otherwise only checks the current project.

    Returns:
        Dict[str, Tuple[str, str]]: Map of job_hash -> (project, table_name) for expected stream ingestion jobs
    """
    job_hash_to_table_refs = {}

    for project in projects:
        feature_tables = client.list_feature_tables(project)
        for feature_table in feature_tables:
            if feature_table.stream_source is not None:
                params = get_stream_to_online_ingestion_params(
                    client, project, feature_table, [])
                job_hash = params.get_job_hash()
                job_hash_to_table_refs[job_hash] = (project,
                                                    feature_table.name)

    return job_hash_to_table_refs
Code example #10
def start_job(feast_client: Client, feature_table: FeatureTable, pytestconfig):
    if pytestconfig.getoption("scheduled_streaming_job"):
        return

    job = feast_client.start_stream_to_online_ingestion(feature_table)
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)
    return job
Code example #11
def test_feature_table_whitelist():
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.writelines([b"project1:table1\n", b"project1:table2"])
        tmp.seek(0)
        feast_client = Client(whitelisted_feature_tables_path=tmp.name)
        job_client = JobClient(feast_client)
        job_servicer = JobServiceServicer(job_client)
        assert not job_servicer.is_feature_table_whitelisted("project2", "table1")
        assert job_servicer.is_feature_table_whitelisted("project1", "table1")
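
The behaviour asserted above is simply membership in the project:table pairs listed in the whitelist file. A minimal, hypothetical version of that check (not the actual JobServiceServicer code) could be:

import tempfile

def load_whitelist(path):
    # Parse "project:table" pairs, one per line.
    with open(path) as f:
        return {tuple(line.strip().split(":")) for line in f if line.strip()}

def is_feature_table_whitelisted(whitelist, project, table_name):
    # An empty whitelist is treated here as "allow everything" (an assumption).
    return not whitelist or (project, table_name) in whitelist

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp:
    tmp.write("project1:table1\nproject1:table2\n")

whitelist = load_whitelist(tmp.name)
assert is_feature_table_whitelisted(whitelist, "project1", "table1")
assert not is_feature_table_whitelisted(whitelist, "project2", "table1")
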
Code example #12
File: client.py, Project: devopstoday11/feast
def feast_client(
    pytestconfig,
    ingestion_job_jar,
    redis_server: RedisExecutor,
    feast_core: Tuple[str, int],
    feast_serving: Tuple[str, int],
    local_staging_path,
):
    if pytestconfig.getoption("env") == "local":
        return Client(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_launcher="standalone",
            spark_standalone_master="local",
            spark_home=os.getenv("SPARK_HOME")
            or os.path.dirname(pyspark.__file__),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=redis_server.host,
            redis_port=redis_server.port,
            spark_staging_location=os.path.join(local_staging_path, "spark"),
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"),
        )

    if pytestconfig.getoption("env") == "gcloud":
        return Client(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_launcher="dataproc",
            dataproc_cluster_name=pytestconfig.getoption(
                "dataproc_cluster_name"),
            dataproc_project=pytestconfig.getoption("dataproc_project"),
            dataproc_region=pytestconfig.getoption("dataproc_region"),
            spark_staging_location=os.path.join(local_staging_path,
                                                "dataproc"),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=pytestconfig.getoption("redis_url").split(":")[0],
            redis_port=pytestconfig.getoption("redis_url").split(":")[1],
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"),
        )
Code example #13
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    status = wait_retry_backoff(
        lambda:
        (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
Code example #14
File: test_telemetry.py, Project: szalai1/feast
def test_telemetry_off_v09(mocker):
    old_environ = dict(os.environ)
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_TELEMETRY"] = "False"

    test_client = Client(serving_url=None, core_url=None, telemetry=False)
    test_client.set_project("project1")
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    mocker.patch.object(
        test_client, "_apply_entity", return_value=None,
    )

    test_client.apply(entity)

    os.environ.clear()
    os.environ.update(old_environ)
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
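
read_bigquery_telemetry_id is a test helper not shown here; it looks up telemetry events by the forced UUID. A hedged stand-in (table and column names are placeholders, not the real telemetry schema) could be:

from google.cloud import bigquery

def read_bigquery_telemetry_id(telemetry_id: str):
    # Hypothetical helper: dataset/table/column names are assumptions.
    client = bigquery.Client()
    query = (
        "SELECT * FROM `my-project.feast_telemetry.events` "
        "WHERE telemetry_id = @telemetry_id"
    )
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("telemetry_id", "STRING", telemetry_id)
        ]
    )
    # The returned RowIterator exposes total_rows, which the test asserts on.
    return client.query(query, job_config=job_config).result()
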
Code example #15
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset,
                                        feast_client: Client,
                                        feast_spark_client: SparkClient):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id",
                    description="S2id",
                    value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=
            f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Code example #16
File: test_telemetry.py, Project: szalai1/feast
def test_telemetry_on_v09(mocker):
    # Setup environment
    old_environ = dict(os.environ)
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    test_client = Client(serving_url=None, core_url=None, telemetry=True)
    test_client.set_project("project1")
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    mocker.patch.object(
        test_client, "_apply_entity", return_value=None,
    )

    test_client.apply(entity)

    os.environ.clear()
    os.environ.update(old_environ)

    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Code example #17
def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Code example #18
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
Code example #19
    def get_conn(self):
        """
        Initialize a Feast client.
        """
        if self.client:
            return self.client

        self.connection = self.get_connection(self.conn_id)
        self.extras = self.connection.extra_dejson
        self.client = Client(core_url=self.extras["core_url"],
                             serving_url=self.extras.get("serving_url"),
                             project=self.extras.get("project"))

        return self.client
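
This Airflow hook expects the Feast endpoints in the connection's Extra field. As an illustration only (connection id, type, and URLs are placeholders; in practice the connection would be created via the Airflow UI, CLI, or an environment variable), the expected Extra payload could be defined like this:

import json
from airflow.models import Connection

conn = Connection(
    conn_id="feast_default",   # placeholder id
    conn_type="grpc",          # arbitrary; the hook only reads the extras
    extra=json.dumps({
        "core_url": "feast-core:6565",
        "serving_url": "feast-serving:6566",
        "project": "default",
    }),
)
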
Code example #20
def client_with_local_spark(tmpdir):
    import pyspark

    spark_staging_location = f"file://{os.path.join(tmpdir, 'staging')}"
    historical_feature_output_location = (
        f"file://{os.path.join(tmpdir, 'historical_feature_retrieval_output')}"
    )

    return Client(
        core_url=f"localhost:{free_port}",
        spark_launcher="standalone",
        spark_standalone_master="local",
        spark_home=os.path.dirname(pyspark.__file__),
        spark_staging_location=spark_staging_location,
        historical_feature_output_location=historical_feature_output_location,
        historical_feature_output_format="parquet",
    )
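
free_port is not defined in this excerpt; a common way to obtain one (an assumption about how the surrounding test module does it) is to bind a socket to port 0 and read back the ephemeral port:

import socket

def get_free_port() -> int:
    # Ask the OS for an ephemeral port, then release it for the test server to reuse.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]

free_port = get_free_port()
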
Code example #21
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
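
read_parquet here is a small test helper not shown in the excerpt; a minimal local-only stand-in (an assumption, the real helper also understands remote object stores) might be:

import pandas as pd

def read_parquet(uri: str) -> pd.DataFrame:
    # Local-only sketch: strip the file:// scheme and let pandas/pyarrow
    # read either a single file or a directory of part files.
    if uri.startswith("file://"):
        uri = uri[len("file://"):]
    return pd.read_parquet(uri)
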
Code example #22
def test_list_jobs_long_table_name(feast_client: Client,
                                   batch_source: Union[BigQuerySource,
                                                       FileSource]):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name=
        "just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id()
        for job in feast_client.list_jobs(include_terminated=True,
                                          table_name=feature_table.name)
    ]
    assert job.get_id() in all_job_ids
Code example #23
File: client.py, Project: polymath-is/feast
def feast_client(
    pytestconfig,
    ingestion_job_jar,
    redis_server: RedisExecutor,
    feast_core: Tuple[str, int],
    feast_serving: Tuple[str, int],
    local_staging_path,
    feast_jobservice: Optional[Tuple[str, int]],
    enable_auth,
):
    if feast_jobservice is None:
        job_service_env = dict()
    else:
        job_service_env = dict(
            job_service_url=f"{feast_jobservice[0]}:{feast_jobservice[1]}")

    if pytestconfig.getoption("env") == "local":
        import pyspark

        return Client(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_launcher="standalone",
            spark_standalone_master="local",
            spark_home=os.getenv("SPARK_HOME")
            or os.path.dirname(pyspark.__file__),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=redis_server.host,
            redis_port=redis_server.port,
            spark_staging_location=os.path.join(local_staging_path, "spark"),
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"),
            ingestion_drop_invalid_rows=True,
            **job_service_env,
        )

    elif pytestconfig.getoption("env") == "gcloud":
        c = Client(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_launcher="dataproc",
            dataproc_cluster_name=pytestconfig.getoption(
                "dataproc_cluster_name"),
            dataproc_project=pytestconfig.getoption("dataproc_project"),
            dataproc_region=pytestconfig.getoption("dataproc_region"),
            spark_staging_location=os.path.join(local_staging_path,
                                                "dataproc"),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=pytestconfig.getoption("redis_url").split(":")[0],
            redis_port=pytestconfig.getoption("redis_url").split(":")[1],
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"),
            ingestion_drop_invalid_rows=True,
            grpc_connection_timeout=30,
            **job_service_env,
        )
    elif pytestconfig.getoption("env") == "aws":
        return Client(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_launcher="emr",
            emr_cluster_id=pytestconfig.getoption("emr_cluster_id"),
            emr_region=pytestconfig.getoption("emr_region"),
            spark_staging_location=os.path.join(local_staging_path, "emr"),
            emr_log_location=os.path.join(local_staging_path, "emr_logs"),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=pytestconfig.getoption("redis_url").split(":")[0],
            redis_port=pytestconfig.getoption("redis_url").split(":")[1],
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"),
            ingestion_drop_invalid_rows=True,
        )
    elif pytestconfig.getoption("env") == "k8s":
        return Client(
            core_url=f"{feast_core[0]}:{feast_core[1]}",
            serving_url=f"{feast_serving[0]}:{feast_serving[1]}",
            spark_launcher="k8s",
            spark_staging_location=os.path.join(local_staging_path, "k8s"),
            spark_ingestion_jar=ingestion_job_jar,
            redis_host=pytestconfig.getoption("redis_url").split(":")[0],
            redis_port=pytestconfig.getoption("redis_url").split(":")[1],
            historical_feature_output_location=os.path.join(
                local_staging_path, "historical_output"),
        )
    else:
        raise KeyError(f"Unknown environment {pytestconfig.getoption('env')}")

    c.set_project(pytestconfig.getoption("feast_project"))
    return c
Code example #24
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(output_dir,
                             azure_account_name=account_name,
                             azure_account_key=account_key)

    expected_joined_df = pd.DataFrame({
        "event_timestamp":
        customers_df.event_timestamp.tolist(),
        "user_id":
        customers_df.user_id.tolist(),
        "transactions__daily_transactions":
        transactions_df.daily_transactions.tolist() +
        [None] * transactions_df.shape[0],
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )

    job = tfrecord_feast_client.get_historical_features(
        feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
Code example #25
File: make_tests.py, Project: hsheth2/datahub
    from feast import Client  # type: ignore
    from feast.data_format import ParquetFormat
    from feast.data_source import FileSource  # type: ignore
    from feast.entity import Entity
    from feast.feature import Feature
    from feast.feature_table import FeatureTable  # type: ignore
    from feast.value_type import ValueType


if __name__ == "__main__":
    if feast.__version__ > FEAST_MIN_VERSION:
        raise Exception(
            f"this code does not work with feast > {FEAST_MIN_VERSION}. Found {feast.__version__}"
        )

    test_client = Client(core_url="testfeast:6565")

    # create dummy entity since Feast demands it
    entity_1 = Entity(
        name="dummy_entity_1",
        description="Dummy entity 1",
        value_type=ValueType.STRING,
        labels={"key": "val"},
    )

    # create dummy entity since Feast demands it
    entity_2 = Entity(
        name="dummy_entity_2",
        description="Dummy entity 2",
        value_type=ValueType.INT32,
        labels={"key": "val"},
Code example #26
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            "event_timestamp",
            "event_timestamp",
            kafka_broker,
            AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 60)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 60)

    try:
        original = generate_data()[[
            "s2id", "unique_drivers", "event_timestamp"
        ]]
        for record in original.to_dict("records"):
            record["event_timestamp"] = (
                record["event_timestamp"].to_pydatetime().replace(
                    tzinfo=pytz.utc))

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=[{
                    "s2id": s2_id
                } for s2_id in original["s2id"].tolist()],
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Code example #27
File: common.py, Project: tianshizz/feast
def stop_job(job, feast_client: Client, feature_table: FeatureTable):
    if job:
        job.cancel()
    else:
        feast_client.delete_feature_table(feature_table.name)
Code example #28
File: ingest.py, Project: hsheth2/datahub
def cli(core_url, output_path):

    client = Client(core_url=core_url)

    tables = client.list_feature_tables()

    # sort tables by name for consistent outputs
    tables = sorted(tables, key=lambda x: x.name)

    parsed_tables = []

    for table in tables:

        # sort entities by name for consistent outputs
        entities = sorted(table.entities)

        batch_source = None
        stream_source = None

        # platform and name for constructing URN later on
        batch_source_platform = "unknown"
        stream_source_platform = "unknown"
        batch_source_name = "unknown"
        stream_source_name = "unknown"

        if isinstance(table.batch_source, BigQuerySource):
            batch_source = "BigQuerySource"
            batch_source_platform = "bigquery"
            batch_source_name = table.batch_source.bigquery_options.table_ref

        if isinstance(table.batch_source, FileSource):
            batch_source = "FileSource"
            batch_source_platform = "file"

            # replace slashes because the react frontend can't parse them correctly
            batch_source_name = table.batch_source.file_options.file_url.replace(
                "/", "."
            )

            # replace redundant file prefix
            if batch_source_name.startswith("file:.."):
                batch_source_name = batch_source_name[7:]

        if isinstance(table.stream_source, KafkaSource):
            stream_source = "KafkaSource"
            stream_source_platform = "kafka"
            stream_source_name = table.stream_source.kafka_options.topic

        if isinstance(table.stream_source, KinesisSource):
            stream_source = "KinesisSource"
            stream_source_platform = "kinesis"
            stream_source_name = f"{table.stream_source.kinesis_options.region}-{table.stream_source.kinesis_options.stream_name}"

        # currently unused in MCE outputs, but useful for debugging
        stream_source_config = table.to_dict()["spec"].get("streamSource")
        batch_source_config = table.to_dict()["spec"]["batchSource"]

        raw_entities = [
            client.get_entity(entity_name) for entity_name in table.entities
        ]
        raw_entities = sorted(raw_entities, key=lambda x: x.name)

        source_info = {
            "batch_source": batch_source,
            "stream_source": stream_source,
            "batch_source_config": batch_source_config,
            "stream_source_config": stream_source_config,
            "batch_source_platform": batch_source_platform,
            "stream_source_platform": stream_source_platform,
            "batch_source_name": batch_source_name,
            "stream_source_name": stream_source_name,
        }

        # sort entities by name for consistent outputs
        entities = sorted(
            [
                {
                    "name": x.name,
                    "type": x.value_type.name,
                    "description": x.description,
                    **source_info,
                }
                for x in raw_entities
            ],
            key=lambda x: x["name"],
        )

        # sort features by name for consistent outputs
        features = sorted(
            [
                {"name": x.name, "type": x.dtype.name, **source_info}
                for x in table.features
            ],
            key=lambda x: x["name"],
        )

        parsed_tables.append(
            {
                "name": table.name,
                "entities": entities,
                "features": features,
            }
        )

    if output_path is not None:

        with open(output_path, "w") as f:
            json.dump(parsed_tables, f)

    else:

        print(parsed_tables)
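
A usage note: the excerpt omits how cli is wired to a command-line parser. Assuming it can also be called directly as a plain function (URLs and paths below are placeholders), the following would dump the parsed tables to JSON and read them back:

import json

# Hypothetical direct invocation; the real module likely wraps cli() with a CLI framework.
cli(core_url="localhost:6565", output_path="/tmp/feast_tables.json")

with open("/tmp/feast_tables.json") as f:
    for table in json.load(f):
        print(table["name"], [feature["name"] for feature in table["features"]])
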
Code example #29
        Feature(name="int64_feature", dtype=ValueType.INT64),
        Feature(name="int32_feature", dtype=ValueType.INT32),
        Feature(name="string_feature", dtype=ValueType.STRING),
        Feature(name="bytes_feature", dtype=ValueType.BYTES),
        Feature(name="bool_feature", dtype=ValueType.BOOL),
        Feature(name="double_feature", dtype=ValueType.DOUBLE),
        Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST),
        Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST),
        Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST),
        Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST),
        Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
        Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
    ],
)

client = Client(core_url=feast_core_url, serving_url=feast_online_serving_url)

# Register feature set
client.apply(all_types_fs_expected)

df.info()
df.describe()
df.head()

# Ingest the data
client.ingest(all_types_fs_expected, df)


# Wait for data to be available
def try_get_features():
    online_request_entity = [{"user_id": 1001}]
Code example #30
def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=None))
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame({
        "event_timestamp": [event_date for _ in customers],
        "created_timestamp": [creation_date for _ in customers],
        "user_id":
        customers,
        "daily_transactions":
        daily_transactions,
        "total_transactions":
        total_transactions,
    })

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame({
        "event_timestamp": [retrieval_date for _ in customers] +
        [retrieval_outside_max_age_date for _ in customers],
        "user_id":
        customers + customers,
    })

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame({
        "event_timestamp": [retrieval_date for _ in customers] +
        [retrieval_outside_max_age_date for _ in customers],
        "user_id":
        customers + customers,
        "transactions__daily_transactions":
        daily_transactions + [None] * len(customers),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )