def validate(self, feature_group, dataframe, save_report, validation_options):
    """Validate a dataframe against the feature group's expectation suite.

    Validation runs only when the feature group has an expectation suite,
    validation is switched on, and the engine fits the feature group kind:
    the `python` engine for stream feature groups, or the `spark` engine for
    non-stream feature groups.

    # Arguments
        feature_group: Feature group whose expectation suite is used.
        dataframe: Data handed to the engine for validation.
        save_report: Fallback for whether the report is saved; the
            `save_report` key of `validation_options` takes precedence.
        validation_options: Dictionary of options. The keys read here are
            `run_validation`, `ge_validate_kwargs` and `save_report`.

    # Returns
        The result of `feature_group.save_validation_report` when the report
        is saved, otherwise a `ValidationReport` built from the report, or
        `None` when no validation was run.
    """
    suite = feature_group.get_expectation_suite(False)
    if suite is None:
        return

    # An explicit option overrides the flag stored on the suite.
    if not validation_options.get("run_validation", suite.run_validation):
        return

    engine_type = engine.get_type()
    if engine_type == "python":
        engine_matches = feature_group.stream
    elif engine_type == "spark":
        engine_matches = not feature_group.stream
    else:
        engine_matches = False
    if not engine_matches:
        return

    report = engine.get_instance().validate_with_great_expectations(
        dataframe=dataframe,
        expectation_suite=suite.to_ge_type(),
        ge_validate_kwargs=validation_options.get("ge_validate_kwargs", {}),
    )

    if validation_options.get("save_report", save_report):
        return feature_group.save_validation_report(report, ge_type=False)
    return validation_report.ValidationReport(**report.to_json_dict())
def compute_training_dataset(
    self,
    feature_view_obj,
    user_write_options,
    training_dataset_obj=None,
    training_dataset_version=None,
):
    """Write a training dataset for a feature view and compute its statistics.

    # Arguments
        feature_view_obj: Feature view the batch query is built from.
        user_write_options: Options forwarded to the engine's
            `write_training_dataset`.
        training_dataset_obj: Training dataset metadata object. When falsy,
            the metadata is looked up from `training_dataset_version`.
        training_dataset_version: Version used to fetch the metadata when no
            object is given.

    # Returns
        The value returned by the engine's `write_training_dataset` call.

    # Raises
        `ValueError`: Neither a training dataset object nor a version is provided.
    """
    if not training_dataset_obj:
        if not training_dataset_version:
            raise ValueError("No training dataset object or version is provided")
        training_dataset_obj = self._get_training_data_metadata(
            feature_view_obj, training_dataset_version
        )

    batch_query = self.get_batch_query(
        feature_view_obj,
        training_dataset_obj.event_start_time,
        training_dataset_obj.event_end_time,
        with_label=True,
    )
    td_job = engine.get_instance().write_training_dataset(
        training_dataset_obj,
        batch_query,
        user_write_options,
        self._OVERWRITE,
        feature_view_obj=feature_view_obj,
    )
    self._td_code_engine.save_code(training_dataset_obj)

    # The written data is only read back on Spark; otherwise no dataframe
    # is handed to the statistics step.
    on_spark = engine.get_type() == "spark"
    td_df = None
    if on_spark:
        if training_dataset_obj.splits:
            td_df = {
                split.name: self._training_dataset_engine.read(
                    training_dataset_obj, split.name, {}
                )
                for split in training_dataset_obj.splits
            }
        else:
            td_df = self._training_dataset_engine.read(training_dataset_obj, None, {})

    # currently we do not save the training dataset statistics config for training datasets
    self.compute_training_dataset_statistics(
        feature_view_obj,
        training_dataset_obj,
        td_df,
        calc_stat=on_spark,
    )
    return td_job
def ingest_validate(self, feature_group, feature_dataframe):
    """Validate a dataframe at ingestion time when running on Spark.

    # Arguments
        feature_group: Feature group whose `validation_type` decides whether
            validation is requested.
        feature_dataframe: Data passed on to `validate`.

    # Returns
        The result of `self.validate(...)` when the validation type is not
        `"NONE"` and the engine is Spark, otherwise `None`.
    """
    validation_requested = feature_group.validation_type != "NONE"
    if not (validation_requested and engine.get_type() == "spark"):
        # If the engine is Python, the validation will be executed by
        # the Hopsworks job ingesting the data
        return None
    return self.validate(feature_group, feature_dataframe, True)
def profile_transformation_fn_statistics(
    self, feature_dataframe, columns, label_encoder_features
):
    """Profile the given columns and add unique-value profiles.

    # Arguments
        feature_dataframe: Data to profile.
        columns: Columns selected for the emptiness check on Spark and passed
            to the engine's `profile` call.
        label_encoder_features: Forwarded to `profile_unique_values`.

    # Returns
        The result of `self.profile_unique_values(...)`.

    # Raises
        `FeatureStoreException`: The dataframe contains no rows.
    """
    engine_type = engine.get_type()
    if engine_type == "spark":
        has_no_rows = len(feature_dataframe.select(*columns).head(1)) == 0
    elif engine_type in ("hive", "python"):
        has_no_rows = len(feature_dataframe.head()) == 0
    else:
        has_no_rows = False

    if has_no_rows:
        raise exceptions.FeatureStoreException(
            "There is no data in the entity that you are trying to compute "
            "statistics for. A possible cause might be that you inserted only data "
            "to the online storage of a feature group."
        )

    # NOTE(review): the three positional flags presumably toggle
    # correlations / histograms / exact uniqueness - confirm against the
    # engine's `profile` signature.
    content_str = engine.get_instance().profile(
        feature_dataframe, columns, False, True, False
    )

    # add unique value profile to String type columns
    return self.profile_unique_values(
        feature_dataframe, label_encoder_features, content_str
    )
def compute_statistics(self):
    """Recompute the statistics for the training dataset.

    The work is delegated to the statistics engine, which receives this
    object together with the data returned by `self.read()`.

    # Returns
        The result of the statistics engine's `compute_statistics`, or `None`
        when statistics are disabled or the engine is not Spark.
    """
    if not self.statistics_config.enabled:
        return None
    if engine.get_type() != "spark":
        return None
    return self._statistics_engine.compute_statistics(self, self.read())
def compute_statistics(self, metadata_instance, feature_dataframe=None):
    """Compute statistics for a dataframe and send the result json to Hopsworks.

    # Arguments
        metadata_instance: Entity the statistics belong to. Its
            `statistics_config` supplies the columns, correlations and
            histograms settings, and its `read()` is used when no dataframe
            is given.
        feature_dataframe: Data to profile, defaults to `None`, in which case
            the data is read from `metadata_instance`.

    # Returns
        The posted `Statistics` object on the Spark engine, otherwise `None`.

    # Raises
        `FeatureStoreException`: The dataframe contains no rows.
    """
    if engine.get_type() == "spark":
        # If the feature dataframe is None, then trigger a read on the metadata instance
        # We do it here to avoid making a useless request when using the Hive engine
        # and calling compute_statistics
        # Compare against None explicitly: truth-testing a dataframe is not a
        # reliable "was it provided" check (pandas raises on it).
        if feature_dataframe is None:
            feature_dataframe = metadata_instance.read()

        commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group."
            )

        content_str = engine.get_instance().profile(
            feature_dataframe,
            metadata_instance.statistics_config.columns,
            metadata_instance.statistics_config.correlations,
            metadata_instance.statistics_config.histograms,
        )
        stats = statistics.Statistics(commit_str, content_str)
        self._statistics_api.post(metadata_instance, stats)
        return stats
    else:
        # Hive engine
        engine.get_instance().profile(metadata_instance)
def save(
    self,
    features: Union[
        query.Query,
        pd.DataFrame,
        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        TypeVar("pyspark.RDD"),  # noqa: F821
        np.ndarray,
        List[list],
    ],
    write_options: Optional[Dict[Any, Any]] = None,
):
    """Materialize the training dataset to storage.

    This method materializes the training dataset either from a Feature Store
    `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, two-dimensional
    Python lists or Numpy ndarrays.
    From v2.5 onward, filters are saved along with the `Query`.

    # Arguments
        features: Feature data to be materialized.
        write_options: Additional write options as key-value pairs, defaults
            to `None`, which is treated as `{}`.
            When using the `python` engine, write_options can contain the
            following entries:
            * key `spark` and value an object of type
            [hsfs.core.job_configuration.JobConfiguration](../job_configuration)
              to configure the Hopsworks Job used to compute the training dataset.
            * key `wait_for_job` and value `True` or `False` to configure
              whether or not the save call should return only
              after the Hopsworks Job has finished. By default it waits.

    # Returns
        `Job`: When using the `python` engine, it returns the Hopsworks Job
            that was launched to create the training dataset.

    # Raises
        `RestAPIError`: Unable to create training dataset metadata.
    """
    # A fresh dict per call: a shared `{}` default would carry any mutation
    # made further down the call chain into later calls.
    if write_options is None:
        write_options = {}

    user_version = self._version
    user_stats_config = self._statistics_config

    # td_job is used only if the python engine is used
    training_dataset, td_job = self._training_dataset_engine.save(
        self, features, write_options
    )
    self.storage_connector = training_dataset.storage_connector

    # currently we do not save the training dataset statistics config for training datasets
    self.statistics_config = user_stats_config
    self._code_engine.save_code(self)

    if self.statistics_config.enabled and engine.get_type() == "spark":
        self.compute_statistics()

    if user_version is None:
        warnings.warn(
            "No version provided for creating training dataset `{}`, incremented version to `{}`.".format(
                self._name, self._version
            ),
            util.VersionWarning,
        )

    return td_job
def get_batch_query_string(self, feature_view_obj, start_time, end_time):
    """Return the constructed batch query string for a feature view.

    # Arguments
        feature_view_obj: Feature view whose `name` and `version` identify
            the batch query.
        start_time: Start of the requested time range, passed to the API.
        end_time: End of the requested time range, passed to the API.

    # Returns
        The constructed query's `pit_query` when it is set, otherwise its
        `query`.
    """
    batch_query = self._feature_view_api.get_batch_query(
        feature_view_obj.name,
        feature_view_obj.version,
        start_time,
        end_time,
        is_python_engine=engine.get_type() == "python",
    )
    constructed = self._query_constructor_api.construct_query(batch_query)

    pit_query = constructed.pit_query
    return constructed.query if pit_query is None else pit_query
def get_batch_query(self, feature_view_obj, start_time, end_time, with_label=False):
    """Fetch the batch query of a feature view from the feature view API.

    # Arguments
        feature_view_obj: Feature view whose `name` and `version` identify
            the batch query.
        start_time: Start of the requested time range, passed to the API.
        end_time: End of the requested time range, passed to the API.
        with_label: Forwarded to the API unchanged, defaults to `False`.

    # Returns
        Whatever the feature view API's `get_batch_query` returns.
    """
    api_kwargs = {
        "is_python_engine": engine.get_type() == "python",
        "with_label": with_label,
    }
    return self._feature_view_api.get_batch_query(
        feature_view_obj.name,
        feature_view_obj.version,
        start_time,
        end_time,
        **api_kwargs,
    )
def save(
    self,
    features: Union[
        pd.DataFrame,
        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        TypeVar("pyspark.RDD"),  # noqa: F821
        np.ndarray,
        List[list],
    ],
    write_options: Optional[Dict[Any, Any]] = None,
):
    """Persist the metadata and materialize the feature group to the feature store.

    Calling `save` creates the metadata for the feature group in the feature store
    and writes the specified `features` dataframe as feature group to the
    online/offline feature store as specified.

    By default, this writes the feature group to the offline storage, and if
    `online_enabled` for the feature group, also to the online feature store.

    The `features` dataframe can be a Spark DataFrame or RDD, a Pandas DataFrame,
    or a two-dimensional Numpy array or a two-dimensional Python nested list.

    # Arguments
        features: DataFrame, RDD, Ndarray, list. Features to be saved.
        write_options: Additional write options for Spark as key-value pairs,
            defaults to `None`, which is treated as `{}`.

    # Returns
        `FeatureGroup`. Returns the persisted `FeatureGroup` metadata object.

    # Raises
        `RestAPIError`. Unable to create feature group.
    """
    # A fresh dict per call: a shared `{}` default would carry any mutation
    # made further down the call chain into later calls.
    if write_options is None:
        write_options = {}

    feature_dataframe = engine.get_instance().convert_to_default_dataframe(features)

    # Remember what the caller asked for so the warning below can tell
    # whether a version was supplied.
    user_version = self._version

    self._feature_group_engine.save(self, feature_dataframe, write_options)

    if self.statistics_config.enabled and engine.get_type() == "spark":
        # Only compute statistics if the engine is Spark.
        # For Hive engine, the computation happens in the Hopsworks application
        self._statistics_engine.compute_statistics(self, feature_dataframe)

    if user_version is None:
        warnings.warn(
            "No version provided for creating feature group `{}`, incremented version to `{}`.".format(
                self._name, self._version
            ),
            util.VersionWarning,
        )

    return self
def get_query(self, online: bool = True, with_label: bool = False):
    """Return the query used to generate this training dataset.

    # Arguments
        online: boolean, optional. Return the query for the online storage, else
            for offline storage, defaults to `True` - for online storage.
        with_label: Indicator whether the query should contain features which were
            marked as prediction label/feature when the training dataset was
            created, defaults to `False`.

    # Returns
        `str`. Query string for the chosen storage used to generate this training
            dataset.
    """
    # The engine is told whether the Python engine is in use.
    on_python_engine = engine.get_type() == "python"
    return self._training_dataset_engine.query(
        self, online, with_label, on_python_engine
    )
def save(
    self,
    features: Union[
        query.Query,
        pd.DataFrame,
        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        TypeVar("pyspark.RDD"),  # noqa: F821
        np.ndarray,
        List[list],
    ],
    write_options: Optional[Dict[Any, Any]] = None,
):
    """Materialize the training dataset to storage.

    This method materializes the training dataset either from a Feature Store
    `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, two-dimensional
    Python lists or Numpy ndarrays.

    # Arguments
        features: Feature data to be materialized.
        write_options: Additional write options as key/value pairs.
            Defaults to `None`, which is treated as `{}`.

    # Returns
        `TrainingDataset`: The updated training dataset metadata object, the
            previous `TrainingDataset` object on which you call `save` is also
            updated.

    # Raises
        `RestAPIError`: Unable to create training dataset metadata.
    """
    # A fresh dict per call: a shared `{}` default would carry any mutation
    # made further down the call chain into later calls.
    if write_options is None:
        write_options = {}

    user_version = self._version
    user_stats_config = self._statistics_config

    self._training_dataset_engine.save(self, features, write_options)

    # currently we do not save the training dataset statistics config for training datasets
    self.statistics_config = user_stats_config

    if self.statistics_config.enabled and engine.get_type() == "spark":
        self._statistics_engine.compute_statistics(self, self.read())

    if user_version is None:
        warnings.warn(
            "No version provided for creating training dataset `{}`, incremented version to `{}`.".format(
                self._name, self._version
            ),
            util.VersionWarning,
        )

    return self
def insert(self, training_dataset, features, user_write_options, overwrite):
    """Write features into an existing training dataset.

    On the Spark engine the data is first checked against the training
    dataset schema; a `Query` is read into a dataframe for that check.

    # Arguments
        training_dataset: Training dataset metadata object to write to.
        features: A `Query` or the data itself.
        user_write_options: Options forwarded to the engine's
            `write_training_dataset`.
        overwrite: When truthy the write mode is `self.OVERWRITE`, otherwise
            `self.APPEND`.
    """
    # validate matching schema
    if engine.get_type() == "spark":
        dataframe = features.read() if isinstance(features, query.Query) else features
        engine.get_instance().training_dataset_schema_match(
            dataframe, training_dataset.schema
        )

    # Note that the original `features` (not the dataframe read for the
    # schema check) is what gets written.
    engine.get_instance().write_training_dataset(
        training_dataset,
        features,
        user_write_options,
        self.OVERWRITE if overwrite else self.APPEND,
    )
def compute_statistics(
    self, metadata_instance, feature_dataframe=None, feature_group_commit_id=None
):
    """Compute statistics for a dataframe and send the result json to Hopsworks.

    # Arguments
        metadata_instance: Entity the statistics belong to; also the source
            of the data when no dataframe is given.
        feature_dataframe: Data to profile, defaults to `None`.
        feature_group_commit_id: When given and no dataframe is supplied, the
            data is read as of this commit, defaults to `None`.

    # Returns
        The posted `Statistics` object on the Spark engine, otherwise `None`.

    # Raises
        `FeatureStoreException`: The dataframe contains no rows.
    """
    if engine.get_type() != "spark":
        # Hive engine
        engine.get_instance().profile(metadata_instance)
        return None

    # If the feature dataframe is None, then trigger a read on the metadata instance
    # We do it here to avoid making a useless request when using the Hive engine
    # and calling compute_statistics
    if feature_dataframe is None:
        if feature_group_commit_id is None:
            feature_dataframe = metadata_instance.read()
        else:
            full_query = metadata_instance.select_all()
            commit_datestr = util.get_hudi_datestr_from_timestamp(
                feature_group_commit_id
            )
            feature_dataframe = full_query.as_of(commit_datestr).read(
                online=False, dataframe_type="default", read_options={}
            )

    # Commit time in milliseconds since the epoch.
    commit_time = int(datetime.datetime.now().timestamp() * 1000)

    if len(feature_dataframe.head(1)) == 0:
        raise exceptions.FeatureStoreException(
            "There is no data in the entity that you are trying to compute "
            "statistics for. A possible cause might be that you inserted only data "
            "to the online storage of a feature group."
        )

    stats_config = metadata_instance.statistics_config
    content_str = engine.get_instance().profile(
        feature_dataframe,
        stats_config.columns,
        stats_config.correlations,
        stats_config.histograms,
    )
    stats = statistics.Statistics(commit_time, feature_group_commit_id, content_str)
    self._statistics_api.post(metadata_instance, stats)
    return stats
def compute_statistics(
    self,
    metadata_instance,
    feature_dataframe=None,
    feature_group_commit_id=None,
    feature_view_obj=None,
):
    """Compute statistics for a dataframe and send the result json to Hopsworks.

    Statistics are computed here when the engine is Spark or a feature view
    is supplied; otherwise the engine's `profile_by_spark` is called instead.

    # Arguments
        metadata_instance: Entity the statistics belong to; also the source
            of the data when no dataframe is given.
        feature_dataframe: Data to profile, defaults to `None`.
        feature_group_commit_id: When given and no dataframe is supplied, the
            data is read as of this commit, defaults to `None`.
        feature_view_obj: Feature view passed on to `_save_statistics`,
            defaults to `None`.
    """
    if engine.get_type() != "spark" and feature_view_obj is None:
        # Python engine
        engine.get_instance().profile_by_spark(metadata_instance)
        return

    # If the feature dataframe is None, then trigger a read on the metadata instance
    # We do it here to avoid making a useless request when using the Python engine
    # and calling compute_statistics
    if feature_dataframe is None:
        if feature_group_commit_id is None:
            feature_dataframe = metadata_instance.read()
        else:
            full_query = metadata_instance.select_all()
            commit_datestr = util.get_hudi_datestr_from_timestamp(
                feature_group_commit_id
            )
            feature_dataframe = full_query.as_of(commit_datestr).read(
                online=False, dataframe_type="default", read_options={}
            )

    # Commit time in milliseconds since the epoch.
    commit_time = int(datetime.datetime.now().timestamp() * 1000)

    content_str = self.profile_statistics(metadata_instance, feature_dataframe)
    if not content_str:
        # Nothing was profiled, so there is nothing to save.
        return

    stats = statistics.Statistics(
        commit_time=commit_time,
        content=content_str,
        feature_group_commit_id=feature_group_commit_id,
    )
    self._save_statistics(stats, metadata_instance, feature_view_obj)
def __init__(
    self,
    left_feature_group,
    left_features,
    feature_store_name=None,
    feature_store_id=None,
    left_feature_group_start_time=None,
    left_feature_group_end_time=None,
    joins=None,
    filter=None,
):
    """Store the query definition and set up the API clients it uses.

    # Arguments
        left_feature_group: Feature group on the left side of the query.
        left_features: Features of the left feature group; parsed with
            `util.parse_features`.
        feature_store_name: Name of the feature store, defaults to `None`.
        feature_store_id: Id of the feature store, also handed to the storage
            connector API, defaults to `None`.
        left_feature_group_start_time: Stored as given, defaults to `None`.
        left_feature_group_end_time: Stored as given, defaults to `None`.
        joins: Joins of the query; a falsy value becomes an empty list.
        filter: Filter definition, converted with `Logic.from_response_json`.
    """
    # Where the query lives.
    self._feature_store_name = feature_store_name
    self._feature_store_id = feature_store_id

    # Left side of the query and its optional time window.
    self._left_feature_group = left_feature_group
    self._left_features = util.parse_features(left_features)
    self._left_feature_group_start_time = left_feature_group_start_time
    self._left_feature_group_end_time = left_feature_group_end_time

    self._joins = joins if joins else []
    self._filter = Logic.from_response_json(filter)

    # Whether the Python engine is active when the query is created.
    self._python_engine = engine.get_type() == "python"

    self._query_constructor_api = query_constructor_api.QueryConstructorApi()
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        feature_store_id
    )
def insert(
    self,
    features: Union[
        pd.DataFrame,
        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
        TypeVar("pyspark.RDD"),  # noqa: F821
        np.ndarray,
        List[list],
    ],
    overwrite: Optional[bool] = False,
    operation: Optional[str] = "upsert",
    storage: Optional[str] = None,
    write_options: Optional[Dict[Any, Any]] = None,
):
    """Insert data from a dataframe into the feature group.

    Incrementally insert data to a feature group or overwrite all data contained
    in the feature group. By default, the data is inserted into the offline storage
    as well as the online storage if the feature group is `online_enabled=True`. To
    insert only into the online storage, set `storage="online"`, or oppositely
    `storage="offline"`.

    The `features` dataframe can be a Spark DataFrame or RDD, a Pandas DataFrame,
    or a two-dimensional Numpy array or a two-dimensional Python nested list.

    If statistics are enabled, statistics are recomputed for the entire feature
    group. This method triggers that recomputation only on the Spark engine.

    If feature group's time travel format is `HUDI` then `operation` argument can be
    either `insert` or `upsert`.

    !!! example "Upsert new feature data with time travel format `HUDI`:"
        ```python
        fs = conn.get_feature_store();
        fg = fs.get_feature_group("example_feature_group", 1)
        upsert_df = ...
        fg.insert(upsert_df)
        ```

    # Arguments
        features: DataFrame, RDD, Ndarray, list. Features to be saved.
        overwrite: Drop all data in the feature group before
            inserting new data. This does not affect metadata, defaults to False.
        operation: Apache Hudi operation type `"insert"` or `"upsert"`.
            Defaults to `"upsert"`.
        storage: Overwrite default behaviour, write to offline
            storage only with `"offline"` or online only with `"online"`, defaults
            to `None`. The value is lower-cased before it is passed on.
        write_options: Additional write options for Spark as key-value pairs,
            defaults to `None`, which is treated as `{}`.

    # Returns
        `FeatureGroup`. Updated feature group metadata object.
    """
    # A fresh dict per call: a shared `{}` default would carry any mutation
    # made further down the call chain into later calls.
    if write_options is None:
        write_options = {}

    feature_dataframe = engine.get_instance().convert_to_default_dataframe(features)

    self._feature_group_engine.insert(
        self,
        feature_dataframe,
        overwrite,
        operation,
        storage.lower() if storage is not None else None,
        write_options,
    )

    if engine.get_type() == "spark":
        # Only compute statistics if the engine is Spark,
        # if Hive, the statistics are computed by the application doing the insert
        self.compute_statistics()

    # Honour the documented return value; previously nothing was returned.
    return self
def _check_feature_group_accessibility(self, feature_view_obj):
    """Reject feature views the Python/Hive engines cannot read directly.

    # Arguments
        feature_view_obj: Feature view whose query is inspected.

    # Raises
        `NotImplementedError`: The engine is `python` or `hive` and the query
            does not read from cached feature groups only.
    """
    if engine.get_type() not in ("python", "hive"):
        return
    if feature_view_obj.query.from_cache_feature_group_only():
        return
    raise NotImplementedError(
        "Python kernel can only read from cached feature group."
        " Please use `feature_view.create_training_data` instead."
    )