Example #1
    def validate(self, feature_group, dataframe, save_report, validation_options):

        suite = feature_group.get_expectation_suite(False)
        if suite is not None:
            run_validation = validation_options.get(
                "run_validation", suite.run_validation
            )
            if (
                run_validation
                and engine.get_type() == "python"
                and feature_group.stream
            ) or (
                run_validation
                and engine.get_type() == "spark"
                and not feature_group.stream
            ):
                report = engine.get_instance().validate_with_great_expectations(
                    dataframe=dataframe,
                    expectation_suite=suite.to_ge_type(),
                    ge_validate_kwargs=validation_options.get("ge_validate_kwargs", {}),
                )

                save_report = validation_options.get("save_report", save_report)
                if save_report:
                    return feature_group.save_validation_report(report, ge_type=False)

                return validation_report.ValidationReport(**report.to_json_dict())
        return
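A minimal sketch of how this method might be invoked, assuming a hypothetical `validation_engine` instance and an existing feature group `fg` with an attached expectation suite (the instance and dataframe names are illustrative, not from the source):

    # Force validation regardless of the suite's default and persist the report
    # on Hopsworks instead of only returning it.
    report = validation_engine.validate(
        fg,
        df,
        save_report=True,
        validation_options={"run_validation": True, "ge_validate_kwargs": {}},
    )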
Example #2
    def compute_training_dataset(
        self,
        feature_view_obj,
        user_write_options,
        training_dataset_obj=None,
        training_dataset_version=None,
    ):
        if training_dataset_obj:
            pass
        elif training_dataset_version:
            training_dataset_obj = self._get_training_data_metadata(
                feature_view_obj, training_dataset_version)
        else:
            raise ValueError(
                "No training dataset object or version is provided")

        batch_query = self.get_batch_query(
            feature_view_obj,
            training_dataset_obj.event_start_time,
            training_dataset_obj.event_end_time,
            with_label=True,
        )
        td_job = engine.get_instance().write_training_dataset(
            training_dataset_obj,
            batch_query,
            user_write_options,
            self._OVERWRITE,
            feature_view_obj=feature_view_obj,
        )
        self._td_code_engine.save_code(training_dataset_obj)
        if engine.get_type() == "spark":
            if training_dataset_obj.splits:
                td_df = dict([(
                    split.name,
                    self._training_dataset_engine.read(training_dataset_obj,
                                                       split.name, {}),
                ) for split in training_dataset_obj.splits])
            else:
                td_df = self._training_dataset_engine.read(
                    training_dataset_obj, None, {})
        else:
            td_df = None
        # currently we do not save the training dataset statistics config for training datasets
        self.compute_training_dataset_statistics(
            feature_view_obj,
            training_dataset_obj,
            td_df,
            calc_stat=engine.get_type() == "spark",
        )
        return td_job
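A hedged sketch of a call into this method, assuming a hypothetical `feature_view_engine` instance and an existing feature view object `fv` (names are illustrative):

    # Recompute an already registered training dataset version; on the Spark
    # engine the written splits are read back so statistics can be computed.
    td_job = feature_view_engine.compute_training_dataset(
        fv,
        user_write_options={},
        training_dataset_version=1,
    )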
Example #3
 def ingest_validate(self, feature_group, feature_dataframe):
     if feature_group.validation_type != "NONE" and engine.get_type(
     ) == "spark":
         # If the engine is Python, the validation will be executed by
         # the Hopsworks job ingesting the data
         return self.validate(feature_group, feature_dataframe, True)
     return None
Example #4
    def profile_transformation_fn_statistics(self, feature_dataframe, columns,
                                             label_encoder_features):
        if (engine.get_type() == "spark"
                and len(feature_dataframe.select(*columns).head(1)) == 0) or (
                    (engine.get_type() == "hive" or engine.get_type()
                     == "python") and len(feature_dataframe.head()) == 0):
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group.")
        content_str = engine.get_instance().profile(feature_dataframe, columns,
                                                    False, True, False)

        # add unique value profile to String type columns
        return self.profile_unique_values(feature_dataframe,
                                          label_encoder_features, content_str)
Example #5
 def compute_statistics(self):
     """Recompute the statistics for the training dataset and save them to the
     feature store.
     """
     if self.statistics_config.enabled and engine.get_type() == "spark":
         return self._statistics_engine.compute_statistics(
             self, self.read())
Example #6
    def compute_statistics(self, metadata_instance, feature_dataframe=None):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark":
            # If the feature dataframe is None, then trigger a read on the metadata instance
            # We do it here to avoid making a useless request when using the Hive engine
            # and calling compute_statistics
            feature_dataframe = (feature_dataframe if feature_dataframe else
                                 metadata_instance.read())

            commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            if len(feature_dataframe.head(1)) == 0:
                raise exceptions.FeatureStoreException(
                    "There is no data in the entity that you are trying to compute "
                    "statistics for. A possible cause might be that you inserted only data "
                    "to the online storage of a feature group.")
            content_str = engine.get_instance().profile(
                feature_dataframe,
                metadata_instance.statistics_config.columns,
                metadata_instance.statistics_config.correlations,
                metadata_instance.statistics_config.histograms,
            )
            stats = statistics.Statistics(commit_str, content_str)
            self._statistics_api.post(metadata_instance, stats)
            return stats

        else:
            # Hive engine
            engine.get_instance().profile(metadata_instance)
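A short sketch of driving this engine method, assuming a hypothetical `statistics_engine` instance, a feature group `fg`, and a dataframe `df` (illustrative names):

    # On the Spark engine the dataframe is profiled locally and the resulting
    # statistics are posted to Hopsworks; on Hive the profiling is delegated
    # to the backend via engine.get_instance().profile(...).
    stats = statistics_engine.compute_statistics(fg, feature_dataframe=df)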
Example #7
    def save(
        self,
        features: Union[query.Query, pd.DataFrame,
                        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
                        TypeVar("pyspark.RDD"),  # noqa: F821
                        np.ndarray, List[list], ],
        write_options: Optional[Dict[Any, Any]] = {},
    ):
        """Materialize the training dataset to storage.

        This method materializes the training dataset either from a Feature Store
        `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, two-dimensional Python
        lists or Numpy ndarrays.
        From v2.5 onward, filters are saved along with the `Query`.

        # Arguments
            features: Feature data to be materialized.
            write_options: Additional write options as key-value pairs, defaults to `{}`.
                When using the `python` engine, write_options can contain the
                following entries:
                * key `spark` and value an object of type
                [hsfs.core.job_configuration.JobConfiguration](../job_configuration)
                  to configure the Hopsworks Job used to compute the training dataset.
                * key `wait_for_job` and value `True` or `False` to configure
                  whether or not the save call should return only
                  after the Hopsworks Job has finished. By default it waits.

        # Returns
            `Job`: When using the `python` engine, it returns the Hopsworks Job
                that was launched to create the training dataset.

        # Raises
            `RestAPIError`: Unable to create training dataset metadata.
        """
        user_version = self._version
        user_stats_config = self._statistics_config
        # td_job is used only if the python engine is used
        training_dataset, td_job = self._training_dataset_engine.save(
            self, features, write_options)
        self.storage_connector = training_dataset.storage_connector
        # currently we do not save the training dataset statistics config for training datasets
        self.statistics_config = user_stats_config
        self._code_engine.save_code(self)
        if self.statistics_config.enabled and engine.get_type() == "spark":
            self.compute_statistics()
        if user_version is None:
            warnings.warn(
                "No version provided for creating training dataset `{}`, incremented version to `{}`."
                .format(self._name, self._version),
                util.VersionWarning,
            )

        return td_job
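Based on the docstring above, a hedged usage sketch for the `python` engine; `fs` and `query` stand for an already obtained feature store handle and a feature store `Query`, and the `create_training_dataset` arguments are illustrative:

    td = fs.create_training_dataset(name="example_td", version=1)
    # Launch the Hopsworks job that materializes the data, but do not block
    # until it finishes.
    job = td.save(query, write_options={"wait_for_job": False})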
Example #8
 def get_batch_query_string(self, feature_view_obj, start_time, end_time):
     query_obj = self._feature_view_api.get_batch_query(
         feature_view_obj.name,
         feature_view_obj.version,
         start_time,
         end_time,
         is_python_engine=engine.get_type() == "python",
     )
     fs_query = self._query_constructor_api.construct_query(query_obj)
     if fs_query.pit_query is not None:
         return fs_query.pit_query
     return fs_query.query
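A minimal sketch of a call, assuming a hypothetical `feature_view_engine` instance; the point-in-time SQL is returned when the backend provides one, otherwise the plain query string:

    # start_time / end_time follow whatever format the feature view API expects.
    sql = feature_view_engine.get_batch_query_string(fv, start_time, end_time)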
Example #9
 def get_batch_query(self,
                     feature_view_obj,
                     start_time,
                     end_time,
                     with_label=False):
     return self._feature_view_api.get_batch_query(
         feature_view_obj.name,
         feature_view_obj.version,
         start_time,
         end_time,
         is_python_engine=engine.get_type() == "python",
         with_label=with_label,
     )
Example #10
    def save(
        self,
        features: Union[
            pd.DataFrame,
            TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
            TypeVar("pyspark.RDD"),  # noqa: F821
            np.ndarray,
            List[list],
        ],
        write_options: Optional[Dict[Any, Any]] = {},
    ):
        """Persist the metadata and materialize the feature group to the feature store.

        Calling `save` creates the metadata for the feature group in the feature store
        and writes the specified `features` dataframe as feature group to the
        online/offline feature store as specified.

        By default, this writes the feature group to the offline storage, and if
        `online_enabled` for the feature group, also to the online feature store.

        The `features` dataframe can be a Spark DataFrame or RDD, a Pandas DataFrame,
        or a two-dimensional Numpy array or a two-dimensional Python nested list.

        # Arguments
            features: Query, DataFrame, RDD, Ndarray, list. Features to be saved.
            write_options: Additional write options for Spark as
                key-value pairs, defaults to `{}`.

        # Returns
            `FeatureGroup`. Returns the persisted `FeatureGroup` metadata object.

        # Raises
            `RestAPIError`. Unable to create feature group.
        """
        feature_dataframe = engine.get_instance().convert_to_default_dataframe(features)

        user_version = self._version
        self._feature_group_engine.save(self, feature_dataframe, write_options)
        if self.statistics_config.enabled and engine.get_type() == "spark":
            # Only compute statistics if the engine is Spark.
            # For Hive engine, the computation happens in the Hopsworks application
            self._statistics_engine.compute_statistics(self, feature_dataframe)
        if user_version is None:
            warnings.warn(
                "No version provided for creating feature group `{}`, incremented version to `{}`.".format(
                    self._name, self._version
                ),
                util.VersionWarning,
            )
        return self
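Following the docstring above, a hedged end-to-end usage sketch; `fs` is an assumed feature store handle and the `create_feature_group` arguments are illustrative:

    fg = fs.create_feature_group("example_fg", version=1, primary_key=["id"])
    # Persists the metadata, writes df, and (only on Spark) computes statistics.
    fg = fg.save(df)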
Example #11
    def get_query(self, online: bool = True, with_label: bool = False):
        """Returns the query used to generate this training dataset

        # Arguments
            online: boolean, optional. Return the query for the online storage, else
                for offline storage, defaults to `True` - for online storage.
            with_label: Indicator whether the query should contain features which were
                marked as prediction label/feature when the training dataset was
                created, defaults to `False`.

        # Returns
            `str`. Query string for the chosen storage used to generate this training
                dataset.
        """
        return self._training_dataset_engine.query(
            self, online, with_label,
            engine.get_type() == "python")
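A short usage sketch based on the docstring, assuming `td` is an existing training dataset object:

    # Offline query string including the columns marked as prediction label.
    query_str = td.get_query(online=False, with_label=True)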
Example #12
    def save(
        self,
        features: Union[
            query.Query,
            pd.DataFrame,
            TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
            TypeVar("pyspark.RDD"),  # noqa: F821
            np.ndarray,
            List[list],
        ],
        write_options: Optional[Dict[Any, Any]] = {},
    ):
        """Materialize the training dataset to storage.

        This method materializes the training dataset either from a Feature Store
        `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, two-dimensional Python
        lists or Numpy ndarrays.

        # Arguments
            features: Feature data to be materialized.
            write_options: Additional write options as key/value pairs.
                Defaults to `{}`.

        # Returns
            `TrainingDataset`: The updated training dataset metadata object; the
                previous `TrainingDataset` object on which you call `save` is also
                updated.

        # Raises
            `RestAPIError`: Unable to create training dataset metadata.
        """
        user_version = self._version
        user_stats_config = self._statistics_config
        self._training_dataset_engine.save(self, features, write_options)
        # currently we do not save the training dataset statistics config for training datasets
        self.statistics_config = user_stats_config
        if self.statistics_config.enabled and engine.get_type() == "spark":
            self._statistics_engine.compute_statistics(self, self.read())
        if user_version is None:
            warnings.warn(
                "No version provided for creating training dataset `{}`, incremented version to `{}`.".format(
                    self._name, self._version
                ),
                util.VersionWarning,
            )
        return self
Example #13
    def insert(self, training_dataset, features, user_write_options,
               overwrite):
        # validate matching schema
        if engine.get_type() == "spark":
            if isinstance(features, query.Query):
                dataframe = features.read()
            else:
                dataframe = features

            engine.get_instance().training_dataset_schema_match(
                dataframe, training_dataset.schema)

        engine.get_instance().write_training_dataset(
            training_dataset,
            features,
            user_write_options,
            self.OVERWRITE if overwrite else self.APPEND,
        )
Example #14
    def compute_statistics(self,
                           metadata_instance,
                           feature_dataframe=None,
                           feature_group_commit_id=None):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark":

            # If the feature dataframe is None, then trigger a read on the metadata instance
            # We do it here to avoid making a useless request when using the Hive engine
            # and calling compute_statistics
            if feature_dataframe is None:
                if feature_group_commit_id is not None:
                    feature_dataframe = (metadata_instance.select_all().as_of(
                        util.get_hudi_datestr_from_timestamp(
                            feature_group_commit_id)).read(
                                online=False,
                                dataframe_type="default",
                                read_options={}))
                else:
                    feature_dataframe = metadata_instance.read()

            commit_time = int(
                float(datetime.datetime.now().timestamp()) * 1000)
            if len(feature_dataframe.head(1)) == 0:
                raise exceptions.FeatureStoreException(
                    "There is no data in the entity that you are trying to compute "
                    "statistics for. A possible cause might be that you inserted only data "
                    "to the online storage of a feature group.")
            content_str = engine.get_instance().profile(
                feature_dataframe,
                metadata_instance.statistics_config.columns,
                metadata_instance.statistics_config.correlations,
                metadata_instance.statistics_config.histograms,
            )
            stats = statistics.Statistics(commit_time, feature_group_commit_id,
                                          content_str)
            self._statistics_api.post(metadata_instance, stats)
            return stats

        else:
            # Hive engine
            engine.get_instance().profile(metadata_instance)
Example #15
    def compute_statistics(
        self,
        metadata_instance,
        feature_dataframe=None,
        feature_group_commit_id=None,
        feature_view_obj=None,
    ):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark" or feature_view_obj is not None:
            # If the feature dataframe is None, then trigger a read on the metadata instance
            # We do it here to avoid making a useless request when using the Python engine
            # and calling compute_statistics
            if feature_dataframe is None:
                if feature_group_commit_id is not None:
                    feature_dataframe = (metadata_instance.select_all().as_of(
                        util.get_hudi_datestr_from_timestamp(
                            feature_group_commit_id)).read(
                                online=False,
                                dataframe_type="default",
                                read_options={}))
                else:
                    feature_dataframe = metadata_instance.read()

            commit_time = int(
                float(datetime.datetime.now().timestamp()) * 1000)

            content_str = self.profile_statistics(metadata_instance,
                                                  feature_dataframe)
            if content_str:
                stats = statistics.Statistics(
                    commit_time=commit_time,
                    content=content_str,
                    feature_group_commit_id=feature_group_commit_id,
                )
                self._save_statistics(stats, metadata_instance,
                                      feature_view_obj)
        else:
            # Python engine
            engine.get_instance().profile_by_spark(metadata_instance)
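A hedged sketch of calling this variant, assuming a hypothetical `statistics_engine` instance and a time-travel enabled feature group `fg`; `commit_id` stands for an illustrative Hudi commit identifier:

    # Profile the feature group as of a specific commit; with the Python engine
    # and no feature view, the profiling is pushed to a Spark job instead.
    statistics_engine.compute_statistics(fg, feature_group_commit_id=commit_id)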
Example #16
 def __init__(
     self,
     left_feature_group,
     left_features,
     feature_store_name=None,
     feature_store_id=None,
     left_feature_group_start_time=None,
     left_feature_group_end_time=None,
     joins=None,
     filter=None,
 ):
     self._feature_store_name = feature_store_name
     self._feature_store_id = feature_store_id
     self._left_feature_group = left_feature_group
     self._left_features = util.parse_features(left_features)
     self._left_feature_group_start_time = left_feature_group_start_time
     self._left_feature_group_end_time = left_feature_group_end_time
     self._joins = joins or []
     self._filter = Logic.from_response_json(filter)
     self._python_engine = True if engine.get_type() == "python" else False
     self._query_constructor_api = query_constructor_api.QueryConstructorApi()
     self._storage_connector_api = storage_connector_api.StorageConnectorApi(
         feature_store_id
     )
Example #17
    def insert(
        self,
        features: Union[pd.DataFrame,
                        TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
                        TypeVar("pyspark.RDD"),  # noqa: F821
                        np.ndarray, List[list], ],
        overwrite: Optional[bool] = False,
        operation: Optional[str] = "upsert",
        storage: Optional[str] = None,
        write_options: Optional[Dict[Any, Any]] = {},
    ):
        """Insert data from a dataframe into the feature group.

        Incrementally insert data to a feature group or overwrite all data contained
        in the feature group. By default, the data is inserted into the offline storage
        as well as the online storage if the feature group is `online_enabled=True`. To
        insert only into the online storage, set `storage="online"`, or oppositely
        `storage="offline"`.

        The `features` dataframe can be a Spark DataFrame or RDD, a Pandas DataFrame,
        or a two-dimensional Numpy array or a two-dimensional Python nested list.

        If statistics are enabled, statistics are recomputed for the entire feature
        group.

        If the feature group's time travel format is `HUDI`, then the `operation`
        argument can be either `insert` or `upsert`.

        !!! example "Upsert new feature data with time travel format `HUDI`:"
            ```python
            fs = conn.get_feature_store();
            fg = fs.get_feature_group("example_feature_group", 1)
            upsert_df = ...
            fg.insert(upsert_df)
            ```

        # Arguments
            features: DataFrame, RDD, Ndarray, list. Features to be saved.
            overwrite: Drop all data in the feature group before
                inserting new data. This does not affect metadata, defaults to False.
            operation: Apache Hudi operation type `"insert"` or `"upsert"`.
                Defaults to `"upsert"`.
            storage: Overwrite default behaviour, write to offline
                storage only with `"offline"` or online only with `"online"`, defaults
                to `None`.
            write_options: Additional write options for Spark as
                key-value pairs, defaults to `{}`.

        # Returns
            `FeatureGroup`. Updated feature group metadata object.
        """
        feature_dataframe = engine.get_instance().convert_to_default_dataframe(
            features)

        self._feature_group_engine.insert(
            self,
            feature_dataframe,
            overwrite,
            operation,
            storage.lower() if storage is not None else None,
            write_options,
        )

        if engine.get_type() == "spark":
            # Only compute statistics if the engine is Spark,
            # if Hive, the statistics are computed by the application doing the insert
            self.compute_statistics()
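Beyond the upsert example in the docstring, a hedged sketch of the other documented options; `fg` and `df` are assumed to already exist:

    # Replace all existing data and write to the offline storage only.
    fg.insert(df, overwrite=True, storage="offline")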
Example #18
 def _check_feature_group_accessibility(self, feature_view_obj):
     if (engine.get_type() == "python" or engine.get_type() == "hive"
         ) and not feature_view_obj.query.from_cache_feature_group_only():
         raise NotImplementedError(
             "Python kernel can only read from cached feature group."
             " Please use `feature_view.create_training_data` instead.")