def __init__(
    self,
    id,
    name,
    featurestore_id,
    description=None,
    # members specific to type of connector
    bootstrap_servers=None,
    security_protocol=None,
    ssl_truststore_location=None,
    ssl_truststore_password=None,
    ssl_keystore_location=None,
    ssl_keystore_password=None,
    ssl_key_password=None,
    ssl_endpoint_identification_algorithm=None,
    options=None,
):
    super().__init__(id, name, description, featurestore_id)

    # KAFKA
    self._bootstrap_servers = bootstrap_servers
    self._security_protocol = security_protocol
    self._ssl_truststore_location = engine.get_instance().add_file(
        ssl_truststore_location
    )
    self._ssl_truststore_password = ssl_truststore_password
    self._ssl_keystore_location = engine.get_instance().add_file(
        ssl_keystore_location
    )
    self._ssl_keystore_password = ssl_keystore_password
    self._ssl_key_password = ssl_key_password
    self._ssl_endpoint_identification_algorithm = (
        ssl_endpoint_identification_algorithm
    )
    self._options = (
        {option["name"]: option["value"] for option in options}
        if options is not None
        else {}
    )

def save(self, features, write_options={}):
    # TODO: Decide if we want to have potentially dangerous defaults like {}
    if isinstance(features, query.Query):
        feature_dataframe = features.read("offline")
        self._querydto = features
    else:
        feature_dataframe = engine.get_instance().convert_to_default_dataframe(
            features
        )

    self._features = engine.get_instance().parse_schema_training_dataset(
        feature_dataframe
    )

    user_version = self._version
    user_stats_config = self._statistics_config
    self._training_dataset_engine.save(self, feature_dataframe, write_options)
    # currently we do not save the training dataset statistics config for training datasets
    self.statistics_config = user_stats_config
    if self.statistics_config.enabled:
        self._statistics_engine.compute_statistics(self, feature_dataframe)
    if user_version is None:
        warnings.warn(
            "No version provided for creating training dataset `{}`, incremented version to `{}`.".format(
                self._name, self._version
            ),
            util.VersionWarning,
        )
    return self

def insert(self, feature_group, feature_dataframe, overwrite, storage, write_options):
    offline_write_options = write_options
    online_write_options = write_options

    if storage.lower() == "online" or storage.lower() == "all":
        # Add JDBC connection configuration in case of online feature group
        online_conn = self._storage_connector_api.get_online_connector()

        jdbc_options = online_conn.spark_options()
        jdbc_options["dbtable"] = self._get_online_table_name(feature_group)

        online_write_options = {**jdbc_options, **online_write_options}

    if (storage.lower() == "offline" or storage.lower() == "all") and overwrite:
        self._feature_group_api.delete_content(feature_group)

    engine.get_instance().save_dataframe(
        self._get_table_name(feature_group),
        feature_group.partition_key,
        feature_dataframe,
        self.APPEND,
        storage,
        offline_write_options,
        online_write_options,
    )

def insert(self, training_dataset, dataset, user_write_options, overwrite):
    engine.get_instance().write_training_dataset(
        training_dataset,
        dataset,
        user_write_options,
        self.OVERWRITE if overwrite else self.APPEND,
    )

def insert(
    self,
    feature_group,
    feature_dataframe,
    overwrite,
    operation,
    storage,
    write_options,
):
    validation_id = None
    if feature_group.validation_type != "NONE":
        validation = feature_group.validate(feature_dataframe)
        validation_id = validation.validation_id

    offline_write_options = write_options
    online_write_options = self.get_kafka_config(write_options)

    if not feature_group.online_enabled and storage == "online":
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group."
        )

    if overwrite:
        self._feature_group_api.delete_content(feature_group)

    engine.get_instance().save_dataframe(
        feature_group,
        feature_dataframe,
        "bulk_insert" if overwrite else operation,
        feature_group.online_enabled,
        storage,
        offline_write_options,
        online_write_options,
        validation_id,
    )

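# --- Usage sketch (not part of the library source) ----------------------------
# Illustrates how the engine-level `insert` above is typically reached through
# the public FeatureGroup API. The connection, feature group name and dataframe
# below are assumptions for illustration only.
import hsfs
import pandas as pd

conn = hsfs.connection()  # assumes a configured Hopsworks environment
fs = conn.get_feature_store()
fg = fs.get_feature_group("transactions", version=1)  # hypothetical feature group
df = pd.DataFrame({"id": [1, 2], "amount": [10.0, 20.0]})
# "upsert" maps to the `operation` argument; overwrite=True would trigger the
# delete_content + "bulk_insert" path above
fg.insert(df, overwrite=False, operation="upsert", write_options={})
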
def save(
    self,
    features: Union[
        query.Query,
        pd.DataFrame,
        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        TypeVar("pyspark.RDD"),  # noqa: F821
        np.ndarray,
        List[list],
    ],
    write_options: Optional[Dict[Any, Any]] = {},
):
    """Materialize the training dataset to storage.

    This method materializes the training dataset either from a Feature Store
    `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, two-dimensional Python
    lists or Numpy ndarrays.

    # Arguments
        features: Feature data to be materialized.
        write_options: Additional write options as key/value pairs.
            Defaults to `{}`.

    # Returns
        `TrainingDataset`: The updated training dataset metadata object. The
            `TrainingDataset` object on which `save` is called is updated in
            place as well.

    # Raises
        `RestAPIError`: Unable to create training dataset metadata.
    """
    if isinstance(features, query.Query):
        feature_dataframe = features.read()
        self._querydto = features
    else:
        feature_dataframe = engine.get_instance().convert_to_default_dataframe(
            features
        )

    self._features = engine.get_instance().parse_schema_training_dataset(
        feature_dataframe
    )
    self._set_label_features()

    user_version = self._version
    user_stats_config = self._statistics_config
    self._training_dataset_engine.save(self, feature_dataframe, write_options)
    # currently we do not save the training dataset statistics config for training datasets
    self.statistics_config = user_stats_config
    if self.statistics_config.enabled:
        self._statistics_engine.compute_statistics(self, feature_dataframe)
    if user_version is None:
        warnings.warn(
            "No version provided for creating training dataset `{}`, incremented version to `{}`.".format(
                self._name, self._version
            ),
            util.VersionWarning,
        )
    return self

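# --- Usage sketch (assumptions noted inline) -----------------------------------
# Shows the documented `save` above being called on a training dataset created
# from a feature store Query. The feature group and training dataset names are
# hypothetical.
import hsfs

conn = hsfs.connection()
fs = conn.get_feature_store()
fg = fs.get_feature_group("transactions", version=1)
td = fs.create_training_dataset(
    name="churn_model_td",  # hypothetical name
    version=None,           # omitting the version exercises the VersionWarning path above
    data_format="csv",
)
td.save(fg.select_all(), write_options={})
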
def compute_statistics(self, metadata_instance, feature_dataframe=None):
    """Compute statistics for a dataframe and send the result json to Hopsworks."""
    if engine.get_type() == "spark":
        # If the feature dataframe is None, then trigger a read on the metadata instance
        # We do it here to avoid making a useless request when using the Hive engine
        # and calling compute_statistics
        feature_dataframe = (
            feature_dataframe if feature_dataframe else metadata_instance.read()
        )

        commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group."
            )
        content_str = engine.get_instance().profile(
            feature_dataframe,
            metadata_instance.statistics_config.columns,
            metadata_instance.statistics_config.correlations,
            metadata_instance.statistics_config.histograms,
        )
        stats = statistics.Statistics(commit_str, content_str)
        self._statistics_api.post(metadata_instance, stats)
        return stats
    else:
        # Hive engine
        engine.get_instance().profile(metadata_instance)

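# --- Usage sketch (hypothetical names) ------------------------------------------
# Triggers the statistics computation above from the public API by enabling a
# statistics configuration on a feature group and recomputing explicitly.
import hsfs
from hsfs.statistics_config import StatisticsConfig

conn = hsfs.connection()
fs = conn.get_feature_store()
fg = fs.get_feature_group("transactions", version=1)
fg.statistics_config = StatisticsConfig(enabled=True, correlations=True, histograms=True)
fg.compute_statistics()  # ends up in the engine method above when running on Spark
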
def get_training_data(
    self,
    feature_view_obj,
    read_options=None,
    splits=[],
    training_dataset_obj=None,
    training_dataset_version=None,
):
    # check whether the provided training dataset version already exists
    if training_dataset_version:
        td_updated = self._get_training_data_metadata(
            feature_view_obj, training_dataset_version
        )
    else:
        td_updated = self._create_training_data_metadata(
            feature_view_obj, training_dataset_obj
        )

    # check splits
    if len(splits) != len(td_updated.splits):
        if len(td_updated.splits) == 0:
            method_name = "get_training_data"
        elif len(td_updated.splits) == 2:
            method_name = "get_train_test_split"
        elif len(td_updated.splits) == 3:
            method_name = "get_train_validation_test_split"
        raise ValueError(
            f"Incorrect `get` method is used. Use `feature_view.{method_name}` instead."
        )

    read_options = engine.get_instance().read_options(
        td_updated.data_format, read_options
    )

    if td_updated.training_dataset_type != td_updated.IN_MEMORY:
        split_df = self._read_from_storage_connector(
            td_updated, td_updated.splits, read_options
        )
    else:
        self._check_feature_group_accessibility(feature_view_obj)
        query = self.get_batch_query(
            feature_view_obj,
            start_time=td_updated.event_start_time,
            end_time=td_updated.event_end_time,
            with_label=True,
        )
        split_df = engine.get_instance().get_training_data(
            td_updated, feature_view_obj, query, read_options
        )
        self.compute_training_dataset_statistics(
            feature_view_obj, td_updated, split_df, calc_stat=True
        )

    # split the dataframe(s) into a features dataframe and a labels dataframe
    if td_updated.splits:
        for split in td_updated.splits:
            split_name = split.name
            split_df[split_name] = engine.get_instance().split_labels(
                split_df[split_name], feature_view_obj.labels
            )
    else:
        split_df = engine.get_instance().split_labels(
            split_df, feature_view_obj.labels
        )

    return td_updated, split_df

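# --- Usage sketch (hypothetical feature view) ------------------------------------
# The split dispatch above corresponds to these public feature view calls; the
# feature view name, versions, and the exact shape of the returned splits are
# assumptions for illustration.
import hsfs

conn = hsfs.connection()
fs = conn.get_feature_store()
fv = fs.get_feature_view("churn_model", version=1)

# no splits -> feature_view.get_training_data
features_df, labels_df = fv.get_training_data(training_dataset_version=1)

# two splits -> feature_view.get_train_test_split (three splits would use
# feature_view.get_train_validation_test_split, as the error message above says)
X_train, X_test, y_train, y_test = fv.get_train_test_split(training_dataset_version=2)
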
def show(self, n):
    """Show the first n rows of the feature group."""
    engine.get_instance().set_job_group(
        "Fetching Feature group",
        "Getting feature group: {} from the featurestore {}".format(
            self._name, self._feature_store_name
        ),
    )
    return self.select_all().show(n)

def read(self, dataframe_type="default"):
    """Get the feature group as a DataFrame."""
    engine.get_instance().set_job_group(
        "Fetching Feature group",
        "Getting feature group: {} from the featurestore {}".format(
            self._name, self._feature_store_name
        ),
    )
    return self.select_all().read(dataframe_type=dataframe_type)

def register_on_demand(self):
    if self._on_demand_fg_aliases is None:
        return

    for on_demand_fg_alias in self._on_demand_fg_aliases:
        engine.get_instance().register_on_demand_temporary_table(
            on_demand_fg_alias.on_demand_feature_group,
            on_demand_fg_alias.alias,
        )

def commit_delete(feature_group, delete_df, write_options):
    hudi_engine_instance = hudi_engine.HudiEngine(
        feature_group.feature_store_id,
        feature_group.feature_store_name,
        feature_group,
        engine.get_instance()._spark_context,
        engine.get_instance()._spark_session,
    )
    return hudi_engine_instance.delete_record(delete_df, write_options)

def save(self, feature_group):
    if len(feature_group.features) == 0:
        # If the user didn't specify the schema, parse it from the query
        on_demand_dataset = (
            engine.get_instance().register_on_demand_temporary_table(
                feature_group, "read_ondmd"
            )
        )
        feature_group._features = engine.get_instance().parse_schema_feature_group(
            on_demand_dataset
        )

    self._feature_group_api.save(feature_group)

def _register_on_demand(self, on_demand_fg_aliases):
    if on_demand_fg_aliases is None:
        return

    for on_demand_fg_alias in on_demand_fg_aliases:
        engine.get_instance().register_on_demand_temporary_table(
            on_demand_fg_alias.on_demand_feature_group.query,
            on_demand_fg_alias.on_demand_feature_group.storage_connector,
            on_demand_fg_alias.alias,
        )

def save(self, feature_group):
    if len(feature_group.features) == 0:
        # If the user didn't specify the schema, parse it from the query
        on_demand_dataset = engine.get_instance().sql(
            feature_group.query, None, feature_group.storage_connector, "default"
        )
        feature_group._features = engine.get_instance().parse_schema_feature_group(
            on_demand_dataset
        )

    self._feature_group_api.save(feature_group)

def read(self, training_dataset, split, user_read_options):
    read_options = engine.get_instance().read_options(
        training_dataset.data_format, user_read_options
    )

    return engine.get_instance().read(
        training_dataset.storage_connector,
        training_dataset.data_format,
        read_options,
        training_dataset.location,
        split,
    )

def save(self, feature_group, feature_dataframe, write_options):
    if len(feature_group.features) == 0:
        # User didn't provide a schema; extract it from the dataframe
        feature_group._features = engine.get_instance().parse_schema_feature_group(
            feature_dataframe
        )

    # set primary and partition key columns
    # we should move this to the backend
    for feat in feature_group.features:
        if feat.name in feature_group.primary_key:
            feat.primary = True
        if feat.name in feature_group.partition_key:
            feat.partition = True
        if (
            feature_group.hudi_precombine_key is not None
            and feat.name == feature_group.hudi_precombine_key
        ):
            feat.hudi_precombine_key = True

    self._feature_group_api.save(feature_group)

    validation_id = None
    if feature_group.validation_type != "NONE":
        validation = feature_group.validate(feature_dataframe)
        validation_id = validation.validation_id

    offline_write_options = write_options
    online_write_options = write_options
    table_name = self._get_table_name(feature_group)

    if feature_group.online_enabled:
        # Add JDBC connection configuration in case of online feature group
        online_conn = self._storage_connector_api.get_online_connector()

        jdbc_options = online_conn.spark_options()
        jdbc_options["dbtable"] = self._get_online_table_name(feature_group)

        online_write_options = {**jdbc_options, **online_write_options}

    engine.get_instance().save_dataframe(
        table_name,
        feature_group,
        feature_dataframe,
        self.APPEND,
        hudi_engine.HudiEngine.HUDI_BULK_INSERT
        if feature_group.time_travel_format == "HUDI"
        else None,
        feature_group.online_enabled,
        None,
        offline_write_options,
        online_write_options,
        validation_id,
    )

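# --- Usage sketch (assumed names) --------------------------------------------------
# Creating and saving a feature group drives the schema parsing and primary /
# partition / precombine key flagging performed in `save` above.
import hsfs
import pandas as pd

conn = hsfs.connection()
fs = conn.get_feature_store()
df = pd.DataFrame({"id": [1, 2], "ts": [100, 200], "amount": [10.0, 20.0]})
fg = fs.create_feature_group(
    name="transactions",        # hypothetical name
    version=1,
    primary_key=["id"],
    partition_key=[],
    online_enabled=True,
    time_travel_format="HUDI",  # selects the HUDI_BULK_INSERT branch above
)
fg.save(df)  # the schema is parsed from `df` since no explicit features were given
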
def append_features(self, feature_group, new_features):
    """Appends features to a feature group."""
    # first get an empty dataframe of the current version and append the new features;
    # writing an empty df to the table is necessary so that the parquet schema,
    # which is used by hudi, gets updated
    df = engine.get_instance().get_empty_appended_dataframe(
        feature_group.read(), new_features
    )

    self._update_features_metadata(
        feature_group, feature_group.features + new_features
    )

    # write empty dataframe to update parquet schema
    engine.get_instance().save_empty_dataframe(feature_group, df)

def show(self, n: int, online: Optional[bool] = False):
    """Show the first `n` rows of the feature group.

    # Arguments
        n: int. Number of rows to show.
        online: bool, optional. If `True` read from online feature store,
            defaults to `False`.
    """
    engine.get_instance().set_job_group(
        "Fetching Feature group",
        "Getting feature group: {} from the featurestore {}".format(
            self._name, self._feature_store_name
        ),
    )
    return self.select_all().show(n, online)

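# --- Usage sketch ------------------------------------------------------------------
# Previewing a feature group from offline and online storage with the method
# documented above (the feature group name is assumed).
import hsfs

conn = hsfs.connection()
fs = conn.get_feature_store()
fg = fs.get_feature_group("transactions", version=1)
fg.show(5)               # offline storage
fg.show(5, online=True)  # online storage, if the feature group is online-enabled
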
def insert(self, training_dataset, feature_dataframe, user_write_options, overwrite):
    # validate matching schema
    engine.get_instance().schema_matches(feature_dataframe, training_dataset.schema)

    write_options = engine.get_instance().write_options(
        training_dataset.data_format, user_write_options
    )

    self._write(
        training_dataset,
        feature_dataframe,
        write_options,
        self.OVERWRITE if overwrite else self.APPEND,
    )

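# --- Usage sketch (hypothetical dataframe) -------------------------------------------
# Appending to or overwriting an existing training dataset runs the schema match
# check above before writing; `extra_df` stands in for new feature data with a
# schema matching the training dataset.
import hsfs
import pandas as pd

conn = hsfs.connection()
fs = conn.get_feature_store()
td = fs.get_training_dataset("churn_model_td", version=1)   # hypothetical name
extra_df = pd.DataFrame({"amount": [30.0], "label": [1]})   # must match td.schema
td.insert(extra_df, overwrite=False)
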
def read(self, training_dataset, split, user_read_options):
    if split is None:
        path = training_dataset.location + "/" + "**"
    else:
        path = training_dataset.location + "/" + str(split)

    read_options = engine.get_instance().read_options(
        training_dataset.data_format, user_read_options
    )

    return engine.get_instance().read(
        training_dataset.storage_connector,
        training_dataset.data_format,
        read_options,
        path,
    )

def read(
    self,
    online: Optional[bool] = False,
    dataframe_type: Optional[str] = "default",
    read_options: Optional[dict] = {},
):
    """Read the specified query into a DataFrame.

    It is possible to specify the storage (online/offline) to read from and the
    type of the output DataFrame (Spark, Pandas, Numpy, Python Lists).

    # Arguments
        online: Read from online storage. Defaults to `False`.
        dataframe_type: DataFrame type to return. Defaults to `"default"`.
        read_options: Optional dictionary with read options for Spark.
            Defaults to `{}`.

    # Returns
        `DataFrame`: DataFrame depending on the chosen type.
    """
    sql_query, online_conn = self._prep_read(online, read_options)

    return engine.get_instance().sql(
        sql_query,
        self._feature_store_name,
        online_conn,
        dataframe_type,
        read_options,
    )

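# --- Usage sketch (hypothetical feature groups and columns) ---------------------------
# Building a Query from feature group selections and reading it with the method
# documented above, from both offline and online storage.
import hsfs

conn = hsfs.connection()
fs = conn.get_feature_store()
trans = fs.get_feature_group("transactions", version=1)
profiles = fs.get_feature_group("profiles", version=1)

query = trans.select(["amount"]).join(profiles.select(["age"]))
offline_df = query.read()                                     # offline storage
online_df = query.read(online=True, dataframe_type="pandas")  # online storage
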
def read(
    self,
    query: str = None,
    data_format: str = None,
    options: dict = {},
    path: str = None,
):
    """Reads results from BigQuery into a Spark dataframe using the storage connector.

    Reading from BigQuery is done by specifying either a BigQuery table or a
    BigQuery query.

    For example, to read from a BigQuery table, set the BigQuery project,
    dataset and table on the storage connector and read directly from the
    corresponding path.

    ```python
    conn.read()
    ```

    Alternatively, to read the results of a BigQuery query, set
    `Materialization Dataset` on the storage connector and pass your SQL to
    the `query` argument.

    ```python
    conn.read(query='SQL')
    ```

    If the table options were also set on the storage connector, passing the
    `query` argument takes priority at runtime. This allows the same connector
    to be used for both a query and a table, assuming all fields were set.
    The `path` argument can also be set to a BigQuery table path at read time,
    if the table options were not set when the connector was created.

    ```python
    conn.read(path='project.dataset.table')
    ```

    # Arguments
        query: BigQuery query. Defaults to `None`.
        data_format: Spark data format. Defaults to `None`.
        options: Spark options. Defaults to `{}`.
        path: BigQuery table path. Defaults to `None`.

    # Raises
        `ValueError`: Malformed arguments.

    # Returns
        `Dataframe`: A Spark dataframe.
    """
    # merge user spark options on top of default spark options
    options = (
        {**self.spark_options(), **options}
        if options is not None
        else self.spark_options()
    )

    if query:
        path = query
    elif self._query_table:
        path = self._query_table
    elif path:
        pass
    else:
        raise ValueError(
            "Either a query should be provided, "
            "or the query project, dataset and table should be set."
        )

    return engine.get_instance().read(self, self.BIGQUERY_FORMAT, options, path)

def read(
    self,
    query: str = None,
    data_format: str = None,
    options: dict = {},
    path: str = None,
):
    """Reads a GCS path into a dataframe using the storage connector.

    ```python
    conn.read(data_format='spark_formats', path='gs://BUCKET/DATA')
    ```

    # Arguments
        data_format: Spark data format. Defaults to `None`.
        options: Spark options. Defaults to `{}`.
        path: GCS path. Defaults to `None`.

    # Raises
        `ValueError`: Malformed arguments.

    # Returns
        `Dataframe`: A Spark dataframe.
    """
    return engine.get_instance().read(self, data_format, options, path)

def insert_stream(
    self,
    feature_group,
    dataframe,
    query_name,
    output_mode,
    await_termination,
    timeout,
    write_options,
):
    if not feature_group.online_enabled:
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group. "
            "It is currently only possible to stream to the online storage."
        )

    if feature_group.validation_type != "NONE":
        warnings.warn(
            "Stream ingestion for feature group `{}`, with version `{}` will not perform validation.".format(
                feature_group.name, feature_group.version
            ),
            util.ValidationWarning,
        )

    return engine.get_instance().save_stream_dataframe(
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        self.get_kafka_config(write_options),
    )

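# --- Usage sketch (assumed streaming source) ------------------------------------------
# Reaching the engine-level `insert_stream` above through the public API with a
# Spark structured streaming dataframe; the feature group and `stream_df` are
# assumptions for illustration.
import hsfs

conn = hsfs.connection()
fs = conn.get_feature_store()
fg = fs.get_feature_group("transactions", version=1)  # must be online-enabled

# stream_df: a pyspark.sql.DataFrame produced by spark.readStream (assumed to exist)
streaming_query = fg.insert_stream(
    stream_df,
    query_name="transactions_stream",
    output_mode="append",
    await_termination=False,
    write_options={},
)
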
def sql(self, query, feature_store_name, dataframe_type, online):
    if online:
        online_conn = self._storage_connector_api.get_online_connector()
    else:
        online_conn = None

    return engine.get_instance().sql(
        query, feature_store_name, online_conn, dataframe_type
    )

def save(self, feature_group, feature_dataframe, write_options, validation_options):
    self._save_feature_group_metadata(feature_group, feature_dataframe, write_options)

    # Deequ validation only on Spark
    validation = feature_group._data_validation_engine.ingest_validate(
        feature_group, feature_dataframe
    )
    validation_id = validation.validation_id if validation is not None else None

    # Great Expectations validation on Python, and on non-stream feature groups on Spark
    ge_report = feature_group._great_expectation_engine.validate(
        feature_group, feature_dataframe, True, validation_options
    )
    if ge_report is not None and ge_report.ingestion_result == "REJECTED":
        return None, ge_report

    offline_write_options = write_options
    online_write_options = self.get_kafka_config(write_options)

    return (
        engine.get_instance().save_dataframe(
            feature_group,
            feature_dataframe,
            hudi_engine.HudiEngine.HUDI_BULK_INSERT
            if feature_group.time_travel_format == "HUDI"
            else None,
            feature_group.online_enabled,
            None,
            offline_write_options,
            online_write_options,
            validation_id,
        ),
        ge_report,
    )

def _save_feature_group_metadata(self, feature_group, feature_dataframe, write_options):
    # reaching this point means the feature group doesn't exist yet and should be created
    if len(feature_group.features) == 0:
        # User didn't provide a schema; extract it from the dataframe
        feature_group._features = engine.get_instance().parse_schema_feature_group(
            feature_dataframe
        )

    # set primary and partition key columns
    # we should move this to the backend
    for feat in feature_group.features:
        if feat.name in feature_group.primary_key:
            feat.primary = True
        if feat.name in feature_group.partition_key:
            feat.partition = True
        if (
            feature_group.hudi_precombine_key is not None
            and feat.name == feature_group.hudi_precombine_key
        ):
            feat.hudi_precombine_key = True

    if feature_group.stream:
        feature_group._options = write_options

    self._feature_group_api.save(feature_group)
    print(
        "Feature Group created successfully, explore it at \n"
        + self._get_feature_group_url(feature_group)
    )

def save(self, feature_group):
    if len(feature_group.features) == 0:
        # If the user didn't specify the schema, parse it from the query
        on_demand_dataset = (
            engine.get_instance().register_on_demand_temporary_table(
                feature_group, "read_ondmd"
            )
        )
        feature_group._features = engine.get_instance().parse_schema_feature_group(
            on_demand_dataset
        )

    # set primary key columns
    # we should move this to the backend
    for feat in feature_group.features:
        if feat.name in feature_group.primary_key:
            feat.primary = True

    self._feature_group_api.save(feature_group)

def show(self, n: int, online: Optional[bool] = False):
    """Show the first N rows of the Query.

    # Arguments
        n: Number of rows to show.
        online: Show from online storage. Defaults to `False`.
    """
    query = self._query_constructor_api.construct_query(self)

    if online:
        sql_query = query.query_online
        online_conn = self._storage_connector_api.get_online_connector()
    else:
        sql_query = query.query
        online_conn = None

        # Register on-demand feature groups as temporary tables
        self._register_on_demand(query.on_demand_fg_aliases)

        # Register hudi feature groups as temporary tables
        self._register_hudi_tables(
            query.hudi_cached_feature_groups,
            self._feature_store_id,
            self._feature_store_name,
            {},
        )

    return engine.get_instance().show(
        sql_query, self._feature_store_name, n, online_conn
    )