Example #1
    def _setup_hudi_read_opts(self, start_timestamp, end_timestamp, read_options):
        """Build the Hudi options for an incremental read between two commit timestamps."""
        _hudi_commit_start_time = util.get_hudi_datestr_from_timestamp(start_timestamp)
        _hudi_commit_end_time = util.get_hudi_datestr_from_timestamp(end_timestamp)

        hudi_options = {
            self.HUDI_QUERY_TYPE_OPT_KEY: self.HUDI_QUERY_TYPE_INCREMENTAL_OPT_VAL,
            self.HUDI_BEGIN_INSTANTTIME_OPT_KEY: _hudi_commit_start_time,
            self.HUDI_END_INSTANTTIME_OPT_KEY: _hudi_commit_end_time,
        }

        if read_options:
            hudi_options.update(read_options)

        return hudi_options
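A minimal usage sketch for the options builder above, assuming `hudi_engine` is an instance of the enclosing class, `spark` is an active SparkSession, and `table_path` points at the Hudi table location (all three names are illustrative, not from the snippet):

    # start_ts / end_ts are epoch-millisecond timestamps (illustrative values).
    opts = hudi_engine._setup_hudi_read_opts(start_ts, end_ts, read_options=None)
    # Feed the resulting options into a standard Spark Hudi read.
    df = spark.read.format("hudi").options(**opts).load(table_path)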
Example #2
    def commit_details(self, feature_group, wallclock_time, limit):
        """Return commit metadata for a time travel (HUDI) enabled feature group."""
        if (feature_group._time_travel_format is None
                or feature_group._time_travel_format.upper() != "HUDI"):
            raise exceptions.FeatureStoreException(
                "commit_details can only be used on time travel enabled feature groups"
            )

        wallclock_timestamp = (
            util.get_timestamp_from_date_string(wallclock_time)
            if wallclock_time is not None else None)
        feature_group_commits = self._feature_group_api.get_commit_details(
            feature_group, wallclock_timestamp, limit)
        commit_details = {}
        for feature_group_commit in feature_group_commits:
            commit_details[feature_group_commit.commitid] = {
                "committedOn": util.get_hudi_datestr_from_timestamp(
                    feature_group_commit.commitid
                ),
                "rowsUpdated": feature_group_commit.rows_updated,
                "rowsInserted": feature_group_commit.rows_inserted,
                "rowsDeleted": feature_group_commit.rows_deleted,
            }
        return commit_details
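A minimal call sketch, assuming `fg_engine` is an instance of the enclosing class and `fg` is a HUDI-backed feature group (both names are illustrative; the wallclock string must be in a format that `util.get_timestamp_from_date_string` accepts):

    # Metadata for up to 5 commits at or before the given wallclock time.
    details = fg_engine.commit_details(fg, "20220101000000", 5)
    for commit_id, info in details.items():
        print(commit_id, info["committedOn"], info["rowsInserted"])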
Example #3
    def compute_statistics(self,
                           metadata_instance,
                           feature_dataframe=None,
                           feature_group_commit_id=None):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark":
            # If the feature dataframe is None, trigger a read on the metadata
            # instance. We do it here to avoid making a useless request when using
            # the Hive engine and calling compute_statistics.
            if feature_dataframe is None:
                if feature_group_commit_id is not None:
                    feature_dataframe = (
                        metadata_instance.select_all()
                        .as_of(util.get_hudi_datestr_from_timestamp(feature_group_commit_id))
                        .read(online=False, dataframe_type="default", read_options={})
                    )
                else:
                    feature_dataframe = metadata_instance.read()

            # Current wallclock time in epoch milliseconds.
            commit_time = int(datetime.datetime.now().timestamp() * 1000)
            if len(feature_dataframe.head(1)) == 0:
                raise exceptions.FeatureStoreException(
                    "There is no data in the entity that you are trying to compute "
                    "statistics for. A possible cause might be that you inserted only data "
                    "to the online storage of a feature group.")
            content_str = engine.get_instance().profile(
                feature_dataframe,
                metadata_instance.statistics_config.columns,
                metadata_instance.statistics_config.correlations,
                metadata_instance.statistics_config.histograms,
            )
            stats = statistics.Statistics(commit_time, feature_group_commit_id,
                                          content_str)
            self._statistics_api.post(metadata_instance, stats)
            return stats

        else:
            # Hive engine
            engine.get_instance().profile(metadata_instance)
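A hedged usage sketch for the Spark branch above, with illustrative names (`stats_engine` for an instance of the enclosing class, `fg` for a feature group):

    # Statistics over the feature group's current data ...
    stats = stats_engine.compute_statistics(fg)
    # ... or pinned to a specific Hudi commit id (an epoch-millisecond timestamp).
    stats = stats_engine.compute_statistics(fg, feature_group_commit_id=1640995200000)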
Example #4
    def compute_statistics(
        self,
        metadata_instance,
        feature_dataframe=None,
        feature_group_commit_id=None,
        feature_view_obj=None,
    ):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark" or feature_view_obj is not None:
            # If the feature dataframe is None, trigger a read on the metadata
            # instance. We do it here to avoid making a useless request when using
            # the Python engine and calling compute_statistics.
            if feature_dataframe is None:
                if feature_group_commit_id is not None:
                    feature_dataframe = (
                        metadata_instance.select_all()
                        .as_of(util.get_hudi_datestr_from_timestamp(feature_group_commit_id))
                        .read(online=False, dataframe_type="default", read_options={})
                    )
                else:
                    feature_dataframe = metadata_instance.read()

            # Current wallclock time in epoch milliseconds.
            commit_time = int(datetime.datetime.now().timestamp() * 1000)

            content_str = self.profile_statistics(metadata_instance,
                                                  feature_dataframe)
            if content_str:
                stats = statistics.Statistics(
                    commit_time=commit_time,
                    content=content_str,
                    feature_group_commit_id=feature_group_commit_id,
                )
                self._save_statistics(stats, metadata_instance,
                                      feature_view_obj)
        else:
            # Python engine
            engine.get_instance().profile_by_spark(metadata_instance)
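Example #4 extends the same method with a `feature_view_obj` parameter so that statistics can be attached to a feature view rather than a feature group. A sketch of both call styles, with illustrative names (`stats_engine`, `fg`, `fv`, `training_df`; `td_metadata` stands in for whatever metadata object the caller owns):

    # Feature group statistics on the Spark engine path.
    stats_engine.compute_statistics(fg)
    # Feature view statistics: supply the dataframe and the view object directly;
    # per the branch above, passing feature_view_obj also enables the non-Spark path.
    stats_engine.compute_statistics(
        td_metadata, feature_dataframe=training_df, feature_view_obj=fv
    )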