def _ingest_request(self):
    """Iterate through the metrics and create an IngestRequest."""
    self._update_service_info()
    request = IngestRequest(reporter=self._reporter)
    request.idempotency_key = self._generate_idempotency_key()

    start_time = Timestamp()
    start_time.GetCurrentTime()

    duration = Duration()
    duration.FromSeconds(self._intervals * self._flush_interval)

    for metric in self._runtime_metrics:
        metric_type = MetricKind.GAUGE
        if len(metric) == 3:
            key, value, metric_type = metric
        else:
            key, value = metric

        request.points.add(
            duration=duration,
            start=start_time,
            labels=self._labels,
            metric_name=key,
            double_value=value,
            kind=metric_type,
        )

    _log.debug("Metrics collected: %s", request)
    return request
def to_internal_job(self, data_store):
    # There should never be more than one active lease for a job. If we
    # have more than one for some reason, just take the first one.
    # TODO(SotK): Log some information here if there are multiple active
    # (ie. not completed or cancelled) leases.
    lease = self.active_leases[0].to_protobuf() if self.active_leases else None

    q_timestamp = Timestamp()
    if self.queued_timestamp:
        q_timestamp.FromDatetime(self.queued_timestamp)

    q_time_duration = Duration()
    if self.queued_time_duration:
        q_time_duration.FromSeconds(self.queued_time_duration)

    ws_timestamp = Timestamp()
    if self.worker_start_timestamp:
        ws_timestamp.FromDatetime(self.worker_start_timestamp)

    wc_timestamp = Timestamp()
    if self.worker_completed_timestamp:
        wc_timestamp.FromDatetime(self.worker_completed_timestamp)

    requirements = {}
    for req in self.platform_requirements:
        values = requirements.setdefault(req.key, set())
        values.add(req.value)

    if self.name in data_store.response_cache:
        result = data_store.response_cache[self.name]
    elif self.result is not None:
        result_digest = string_to_digest(self.result)
        result = data_store.storage.get_message(result_digest, ExecuteResponse)
    else:
        result = None

    return job.Job(
        self.do_not_cache,
        string_to_digest(self.action_digest),
        platform_requirements=requirements,
        priority=self.priority,
        name=self.name,
        operations=[op.to_protobuf() for op in self.operations],
        cancelled_operations=set(op.name for op in self.operations if op.cancelled),
        lease=lease,
        stage=self.stage,
        cancelled=self.cancelled,
        queued_timestamp=q_timestamp,
        queued_time_duration=q_time_duration,
        worker_start_timestamp=ws_timestamp,
        worker_completed_timestamp=wc_timestamp,
        done=all(op.done for op in self.operations) and len(self.operations) > 0,
        result=result,
        worker_name=self.active_leases[0].worker_name if self.active_leases else None,
        n_tries=self.n_tries,
    )
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)
    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
def bookings_feature_table(spark, client):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema, df_data)

    file_source = FileSource("event_timestamp", "created_timestamp", "parquet", file_uri)
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable(
        "bookings", ["driver_id"], features, batch_source=file_source, max_age=max_age
    )
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
def done(context, message, args):
    assert Fact._provider is not None
    assert Fact.config["io"].connected

    Fact._trace.EndTime.CopyFrom(Fact.now())

    # convert the timestamp to milliseconds for use as the log key
    key = int(datetime.now().timestamp() * 1000)
    Fact._trace.Logs[key] = message
    Fact._trace.Args.extend(args)

    # compute the execution duration and store it as a Duration proto
    duration = Duration()
    exec_time = Fact._trace.EndTime.seconds - Fact._trace.StartTime.seconds
    duration.FromSeconds(exec_time)
    Fact._trace.ExecutionLatency.CopyFrom(duration)

    Fact._provider.collect(Fact._trace, context)

    if "send_on_update" in Fact.config and Fact.config["send_on_update"]:
        Fact.send("done")
    return Fact._trace
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)
    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )
    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir, azure_account_name=account_name, azure_account_key=account_key
    )

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = tfrecord_feast_client.get_historical_features(feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)
    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )
    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (
        datetime.utcnow()
        .replace(hour=0, minute=0, second=0, microsecond=0)
        .replace(tzinfo=None)
    )
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame(
        {
            "event_timestamp": [event_date for _ in customers],
            "created_timestamp": [creation_date for _ in customers],
            "user_id": customers,
            "daily_transactions": daily_transactions,
            "total_transactions": total_transactions,
        }
    )
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
        }
    )

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": [retrieval_date for _ in customers]
            + [retrieval_outside_max_age_date for _ in customers],
            "user_id": customers + customers,
            "transactions__daily_transactions": daily_transactions
            + [None] * len(customers),
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )