Code example #1
    def __init__(
        self,
        id,
        name,
        featurestore_id,
        description=None,
        # members specific to the type of connector
        bootstrap_servers=None,
        security_protocol=None,
        ssl_truststore_location=None,
        ssl_truststore_password=None,
        ssl_keystore_location=None,
        ssl_keystore_password=None,
        ssl_key_password=None,
        ssl_endpoint_identification_algorithm=None,
        options=None,
    ):
        super().__init__(id, name, description, featurestore_id)

        # KAFKA
        self._bootstrap_servers = bootstrap_servers
        self._security_protocol = security_protocol
        self._ssl_truststore_location = engine.get_instance().add_file(
            ssl_truststore_location)
        self._ssl_truststore_password = ssl_truststore_password
        self._ssl_keystore_location = engine.get_instance().add_file(
            ssl_keystore_location)
        self._ssl_keystore_password = ssl_keystore_password
        self._ssl_key_password = ssl_key_password
        self._ssl_endpoint_identification_algorithm = (
            ssl_endpoint_identification_algorithm)
        self._options = (
            {option["name"]: option["value"]
             for option in options} if options is not None else {})
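The `options` argument above arrives as a list of `{"name": ..., "value": ...}` dicts and is flattened into a plain mapping. A minimal sketch of that transformation, using hypothetical Kafka option names:

```python
# Hypothetical input, mirroring the dict comprehension in the constructor above.
options = [
    {"name": "kafka.acks", "value": "all"},
    {"name": "kafka.max.request.size", "value": "1048576"},
]
flattened = {option["name"]: option["value"] for option in options}
# flattened == {"kafka.acks": "all", "kafka.max.request.size": "1048576"}
```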
Code example #2
    def save(self, features, write_options={}):
        # TODO: Decide if we want to have potentially dangerous defaults like {}
        if isinstance(features, query.Query):
            feature_dataframe = features.read("offline")
            self._querydto = features
        else:
            feature_dataframe = engine.get_instance().convert_to_default_dataframe(
                features
            )
            self._features = engine.get_instance().parse_schema_training_dataset(
                feature_dataframe
            )

        user_version = self._version
        user_stats_config = self._statistics_config
        self._training_dataset_engine.save(self, feature_dataframe,
                                           write_options)
        # currently we do not save the training dataset statistics config for training datasets
        self.statistics_config = user_stats_config
        if self.statistics_config.enabled:
            self._statistics_engine.compute_statistics(self, feature_dataframe)
        if user_version is None:
            warnings.warn(
                "No version provided for creating training dataset `{}`, incremented version to `{}`."
                .format(self._name, self._version),
                util.VersionWarning,
            )
        return self
Code example #3
    def insert(self, feature_group, feature_dataframe, overwrite, storage,
               write_options):
        offline_write_options = write_options
        online_write_options = write_options

        if storage.lower() == "online" or storage.lower() == "all":
            # Add JDBC connection configuration in case of online feature group
            online_conn = self._storage_connector_api.get_online_connector()

            jdbc_options = online_conn.spark_options()
            jdbc_options["dbtable"] = self._get_online_table_name(
                feature_group)

            online_write_options = {**jdbc_options, **online_write_options}

        if (storage.lower() == "offline"
                or storage.lower() == "all") and overwrite:
            self._feature_group_api.delete_content(feature_group)

        engine.get_instance().save_dataframe(
            self._get_table_name(feature_group),
            feature_group.partition_key,
            feature_dataframe,
            self.APPEND,
            storage,
            offline_write_options,
            online_write_options,
        )
Code example #4
 def insert(self, training_dataset, dataset, user_write_options, overwrite):
     engine.get_instance().write_training_dataset(
         training_dataset,
         dataset,
         user_write_options,
         self.OVERWRITE if overwrite else self.APPEND,
     )
Code example #5
    def insert(
        self,
        feature_group,
        feature_dataframe,
        overwrite,
        operation,
        storage,
        write_options,
    ):
        validation_id = None
        if feature_group.validation_type != "NONE":
            validation = feature_group.validate(feature_dataframe)
            validation_id = validation.validation_id

        offline_write_options = write_options
        online_write_options = self.get_kafka_config(write_options)

        if not feature_group.online_enabled and storage == "online":
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group.")

        if overwrite:
            self._feature_group_api.delete_content(feature_group)

        engine.get_instance().save_dataframe(
            feature_group,
            feature_dataframe,
            "bulk_insert" if overwrite else operation,
            feature_group.online_enabled,
            storage,
            offline_write_options,
            online_write_options,
            validation_id,
        )
Code example #6
    def save(
        self,
        features: Union[
            query.Query,
            pd.DataFrame,
            TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
            TypeVar("pyspark.RDD"),  # noqa: F821
            np.ndarray,
            List[list],
        ],
        write_options: Optional[Dict[Any, Any]] = {},
    ):
        """Materialize the training dataset to storage.

        This method materializes the training dataset from a Feature Store
        `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, a two-dimensional
        Python list, or a Numpy ndarray.

        # Arguments
            features: Feature data to be materialized.
            write_options: Additional write options as key/value pairs.
                Defaults to `{}`.

        # Returns
            `TrainingDataset`: The updated training dataset metadata object; the
                `TrainingDataset` object on which `save` was called is also updated
                in place.

        # Raises
            `RestAPIError`: Unable to create training dataset metadata.
        """
        if isinstance(features, query.Query):
            feature_dataframe = features.read()
            self._querydto = features
        else:
            feature_dataframe = engine.get_instance().convert_to_default_dataframe(
                features
            )

        self._features = engine.get_instance().parse_schema_training_dataset(
            feature_dataframe
        )

        self._set_label_features()

        user_version = self._version
        user_stats_config = self._statistics_config
        self._training_dataset_engine.save(self, feature_dataframe, write_options)
        # currently we do not save the training dataset statistics config for training datasets
        self.statistics_config = user_stats_config
        if self.statistics_config.enabled:
            self._statistics_engine.compute_statistics(self, feature_dataframe)
        if user_version is None:
            warnings.warn(
                "No version provided for creating training dataset `{}`, incremented version to `{}`.".format(
                    self._name, self._version
                ),
                util.VersionWarning,
            )
        return self
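Based on the docstring above, a hedged usage sketch; the `create_training_dataset` factory call, the handles `fs` and `query`, and the `wait_for_job` option are assumptions about the surrounding API and may differ between versions:

```python
# Hypothetical usage: materialize a training dataset from a feature store query.
td = fs.create_training_dataset(   # `fs` is an assumed feature store handle
    name="sales_model_data",       # illustrative name
    version=None,                  # None triggers the VersionWarning and auto-increments
    data_format="csv",
)
td.save(query, write_options={"wait_for_job": False})  # `query` is an assumed Query object
```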
Code example #7
    def compute_statistics(self, metadata_instance, feature_dataframe=None):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark":
            # If the feature dataframe is None, then trigger a read on the metadata instance
            # We do it here to avoid making a useless request when using the Hive engine
            # and calling compute_statistics
            feature_dataframe = (feature_dataframe if feature_dataframe else
                                 metadata_instance.read())

            commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            if len(feature_dataframe.head(1)) == 0:
                raise exceptions.FeatureStoreException(
                    "There is no data in the entity that you are trying to compute "
                    "statistics for. A possible cause might be that you inserted only data "
                    "to the online storage of a feature group.")
            content_str = engine.get_instance().profile(
                feature_dataframe,
                metadata_instance.statistics_config.columns,
                metadata_instance.statistics_config.correlations,
                metadata_instance.statistics_config.histograms,
            )
            stats = statistics.Statistics(commit_str, content_str)
            self._statistics_api.post(metadata_instance, stats)
            return stats

        else:
            # Hive engine
            engine.get_instance().profile(metadata_instance)
Code example #8
    def get_training_data(
        self,
        feature_view_obj,
        read_options=None,
        splits=[],
        training_dataset_obj=None,
        training_dataset_version=None,
    ):
        # check whether the provided training dataset version already exists
        if training_dataset_version:
            td_updated = self._get_training_data_metadata(
                feature_view_obj, training_dataset_version)
        else:
            td_updated = self._create_training_data_metadata(
                feature_view_obj, training_dataset_obj)
        # check splits
        if len(splits) != len(td_updated.splits):
            if len(td_updated.splits) == 0:
                method_name = "get_training_data"
            elif len(td_updated.splits) == 2:
                method_name = "get_train_test_split"
            elif len(td_updated.splits) == 3:
                method_name = "get_train_validation_test_split"
            raise ValueError(
                f"Incorrect `get` method is used. Use `feature_view.{method_name}` instead."
            )

        read_options = engine.get_instance().read_options(
            td_updated.data_format, read_options)

        if td_updated.training_dataset_type != td_updated.IN_MEMORY:
            split_df = self._read_from_storage_connector(
                td_updated, td_updated.splits, read_options)
        else:
            self._check_feature_group_accessibility(feature_view_obj)
            query = self.get_batch_query(
                feature_view_obj,
                start_time=td_updated.event_start_time,
                end_time=td_updated.event_end_time,
                with_label=True,
            )
            split_df = engine.get_instance().get_training_data(
                td_updated, feature_view_obj, query, read_options)
            self.compute_training_dataset_statistics(feature_view_obj,
                                                     td_updated,
                                                     split_df,
                                                     calc_stat=True)

        # split df into features and labels df
        if td_updated.splits:
            for split in td_updated.splits:
                split_name = split.name
                split_df[split_name] = engine.get_instance().split_labels(
                    split_df[split_name], feature_view_obj.labels)
        else:
            split_df = engine.get_instance().split_labels(
                split_df, feature_view_obj.labels)

        return td_updated, split_df
Code example #9
 def show(self, n):
     """Show the first n rows of the feature group."""
     engine.get_instance().set_job_group(
         "Fetching Feature group",
         "Getting feature group: {} from the featurestore {}".format(
             self._name, self._feature_store_name),
     )
     return self.select_all().show(n)
Code example #10
 def read(self, dataframe_type="default"):
     """Get the feature group as a DataFrame."""
     engine.get_instance().set_job_group(
         "Fetching Feature group",
         "Getting feature group: {} from the featurestore {}".format(
             self._name, self._feature_store_name),
     )
     return self.select_all().read(dataframe_type=dataframe_type)
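A short usage sketch of the `read` method above (the feature group handle `fg` is assumed):

```python
# Hypothetical call: load the whole feature group as a dataframe.
df = fg.read()
# The dataframe type can be requested explicitly; "pandas" is an assumed value.
pdf = fg.read(dataframe_type="pandas")
```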
Code example #11
    def register_on_demand(self):
        if self._on_demand_fg_aliases is None:
            return

        for on_demand_fg_alias in self._on_demand_fg_aliases:
            engine.get_instance().register_on_demand_temporary_table(
                on_demand_fg_alias.on_demand_feature_group,
                on_demand_fg_alias.alias,
            )
Code example #12
 def commit_delete(feature_group, delete_df, write_options):
     hudi_engine_instance = hudi_engine.HudiEngine(
         feature_group.feature_store_id,
         feature_group.feature_store_name,
         feature_group,
         engine.get_instance()._spark_context,
         engine.get_instance()._spark_session,
     )
     return hudi_engine_instance.delete_record(delete_df, write_options)
Code example #13
    def save(self, feature_group):
        if len(feature_group.features) == 0:
            # If the user didn't specify the schema, parse it from the query
            on_demand_dataset = (
                engine.get_instance().register_on_demand_temporary_table(
                    feature_group, "read_ondmd"))
            feature_group._features = engine.get_instance().parse_schema_feature_group(
                on_demand_dataset
            )

        self._feature_group_api.save(feature_group)
Code example #14
    def _register_on_demand(self, on_demand_fg_aliases):
        if on_demand_fg_aliases is None:
            return

        for on_demand_fg_alias in on_demand_fg_aliases:
            engine.get_instance().register_on_demand_temporary_table(
                on_demand_fg_alias.on_demand_feature_group.query,
                on_demand_fg_alias.on_demand_feature_group.storage_connector,
                on_demand_fg_alias.alias,
            )
Code example #15
    def save(self, feature_group):
        if len(feature_group.features) == 0:
            # If the user didn't specify the schema, parse it from the query
            on_demand_dataset = engine.get_instance().sql(
                feature_group.query, None, feature_group.storage_connector, "default"
            )
            feature_group._features = engine.get_instance().parse_schema_feature_group(
                on_demand_dataset
            )

        self._feature_group_api.save(feature_group)
Code example #16
 def read(self, training_dataset, split, user_read_options):
     read_options = engine.get_instance().read_options(
         training_dataset.data_format, user_read_options
     )
     return engine.get_instance().read(
         training_dataset.storage_connector,
         training_dataset.data_format,
         read_options,
         training_dataset.location,
         split,
     )
Code example #17
    def save(self, feature_group, feature_dataframe, write_options):

        if len(feature_group.features) == 0:
            # User didn't provide a schema; extract it from the dataframe
            feature_group._features = engine.get_instance().parse_schema_feature_group(
                feature_dataframe
            )

        # set primary and partition key columns
        # we should move this to the backend
        for feat in feature_group.features:
            if feat.name in feature_group.primary_key:
                feat.primary = True
            if feat.name in feature_group.partition_key:
                feat.partition = True
            if (feature_group.hudi_precombine_key is not None
                    and feat.name == feature_group.hudi_precombine_key):
                feat.hudi_precombine_key = True

        self._feature_group_api.save(feature_group)
        validation_id = None
        if feature_group.validation_type != "NONE":
            validation = feature_group.validate(feature_dataframe)
            validation_id = validation.validation_id

        offline_write_options = write_options
        online_write_options = write_options

        table_name = self._get_table_name(feature_group)

        if feature_group.online_enabled:
            # Add JDBC connection configuration in case of online feature group
            online_conn = self._storage_connector_api.get_online_connector()

            jdbc_options = online_conn.spark_options()
            jdbc_options["dbtable"] = self._get_online_table_name(
                feature_group)

            online_write_options = {**jdbc_options, **online_write_options}

        engine.get_instance().save_dataframe(
            table_name,
            feature_group,
            feature_dataframe,
            self.APPEND,
            hudi_engine.HudiEngine.HUDI_BULK_INSERT
            if feature_group.time_travel_format == "HUDI" else None,
            feature_group.online_enabled,
            None,
            offline_write_options,
            online_write_options,
            validation_id,
        )
Code example #18
    def append_features(self, feature_group, new_features):
        """Appends features to a feature group."""
        # First get an empty dataframe of the current version and append the new
        # features. Writing the empty df to the table is necessary so that the
        # parquet schema, which is used by Hudi, gets updated.
        df = engine.get_instance().get_empty_appended_dataframe(
            feature_group.read(), new_features)

        self._update_features_metadata(feature_group,
                                       feature_group.features + new_features)

        # write empty dataframe to update parquet schema
        engine.get_instance().save_empty_dataframe(feature_group, df)
Code example #19
    def show(self, n: int, online: Optional[bool] = False):
        """Show the first `n` rows of the feature group.

        # Arguments
            n: int. Number of rows to show.
            online: bool, optional. If `True`, read from the online feature store.
                Defaults to `False`.
        """
        engine.get_instance().set_job_group(
            "Fetching Feature group",
            "Getting feature group: {} from the featurestore {}".format(
                self._name, self._feature_store_name),
        )
        return self.select_all().show(n, online)
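A one-line usage sketch for `show` (the feature group handle `fg` is assumed):

```python
# Hypothetical call: preview the first 5 rows, reading from the online store.
fg.show(5, online=True)
```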
Code example #20
    def insert(self, training_dataset, feature_dataframe, user_write_options,
               overwrite):
        # validate matching schema
        engine.get_instance().schema_matches(feature_dataframe,
                                             training_dataset.schema)

        write_options = engine.get_instance().write_options(
            training_dataset.data_format, user_write_options)

        self._write(
            training_dataset,
            feature_dataframe,
            write_options,
            self.OVERWRITE if overwrite else self.APPEND,
        )
Code example #21
    def read(self, training_dataset, split, user_read_options):
        if split is None:
            path = training_dataset.location + "/" + "**"
        else:
            path = training_dataset.location + "/" + str(split)

        read_options = engine.get_instance().read_options(
            training_dataset.data_format, user_read_options)

        return engine.get_instance().read(
            training_dataset.storage_connector,
            training_dataset.data_format,
            read_options,
            path,
        )
Code example #22
    def read(
        self,
        online: Optional[bool] = False,
        dataframe_type: Optional[str] = "default",
        read_options: Optional[dict] = {},
    ):
        """Read the specified query into a DataFrame.

        It is possible to specify the storage (online/offline) to read from and the
        type of the output DataFrame (Spark, Pandas, Numpy, Python Lists).

        # Arguments
            online: Read from online storage. Defaults to `False`.
            dataframe_type: DataFrame type to return. Defaults to `"default"`.
            read_options: Optional dictionary with read options for Spark.
                Defaults to `{}`.

        # Returns
            `DataFrame`: DataFrame depending on the chosen type.
        """
        sql_query, online_conn = self._prep_read(online, read_options)

        return engine.get_instance().sql(
            sql_query,
            self._feature_store_name,
            online_conn,
            dataframe_type,
            read_options,
        )
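A hedged usage sketch of the `read` method above, assuming `query` was built by selecting and joining features elsewhere:

```python
# Hypothetical call: read the query result from offline storage as the default
# dataframe type, passing no extra Spark read options.
df = query.read(online=False, dataframe_type="default", read_options={})
```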
Code example #23
    def read(
        self,
        query: str = None,
        data_format: str = None,
        options: dict = {},
        path: str = None,
    ):
        """Reads results from BigQuery into a spark dataframe using the storage connector.

          Reading from bigquery is done via either specifying the BigQuery table or BigQuery query.
          For example, to read from a BigQuery table, set the BigQuery project, dataset and table on storage connector
          and read directly from the corresponding path.
            ```python
            conn.read()
            ```
          OR, to read results from a BigQuery query, set `Materialization Dataset` on storage connector,
           and pass your SQL to `query` argument.
            ```python
            conn.read(query='SQL')
            ```
          Optionally, passing `query` argument will take priority at runtime if the table options were also set
          on the storage connector. This allows user to run from both a query or table with same connector, assuming
          all fields were set.
          Also, user can set the `path` argument to a bigquery table path to read at runtime,
           if table options were not set initially while creating the connector.
            ```python
            conn.read(path='project.dataset.table')
            ```

        # Arguments
            query: BigQuery query. Defaults to `None`.
            data_format: Spark data format. Defaults to `None`.
            options: Spark options. Defaults to `None`.
            path: BigQuery table path. Defaults to `None`.

        # Raises
            `ValueError`: Malformed arguments.

        # Returns
            `Dataframe`: A Spark dataframe.
        """

        # merge user spark options on top of default spark options
        options = ({
            **self.spark_options(),
            **options
        } if options is not None else self.spark_options())
        if query:
            path = query
        elif self._query_table:
            path = self._query_table
        elif path:
            pass
        else:
            raise ValueError(
                "Either query should be provided "
                "or Query Project,Dataset and Table should be set")

        return engine.get_instance().read(self, self.BIGQUERY_FORMAT, options,
                                          path)
Code example #24
    def read(
        self,
        query: str = None,
        data_format: str = None,
        options: dict = {},
        path: str = None,
    ):
        """Reads GCS path into a dataframe using the storage connector.

        ```python
        conn.read(data_format='spark_formats',path='gs://BUCKET/DATA')
        ```

        # Arguments
            data_format: Spark data format. Defaults to `None`.
            options: Spark options. Defaults to `{}`.
            path: GCS path. Defaults to `None`.

        # Raises
            `ValueError`: Malformed arguments.

        # Returns
            `Dataframe`: A Spark dataframe.
        """

        return engine.get_instance().read(self, data_format, options, path)
Code example #25
    def insert_stream(
        self,
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        write_options,
    ):
        if not feature_group.online_enabled:
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group. "
                "It is currently only possible to stream to the online storage."
            )

        if feature_group.validation_type != "NONE":
            warnings.warn(
                "Stream ingestion for feature group `{}`, with version `{}` will not perform validation."
                .format(feature_group.name, feature_group.version),
                util.ValidationWarning,
            )

        return engine.get_instance().save_stream_dataframe(
            feature_group,
            dataframe,
            query_name,
            output_mode,
            await_termination,
            timeout,
            self.get_kafka_config(write_options),
        )
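A hedged sketch of how streaming ingestion might be invoked from user code; `fg` and `streaming_df` are assumptions (an online-enabled feature group and a Spark structured-streaming dataframe), and the keyword names simply follow the parameters of the engine method above:

```python
# Hypothetical call: start streaming ingestion into the online store.
stream_query = fg.insert_stream(
    streaming_df,
    query_name="ingest_clicks",   # illustrative query name
    output_mode="append",
    await_termination=False,
    timeout=None,
    write_options={},
)
```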
Code example #26
 def sql(self, query, feature_store_name, dataframe_type, online):
     if online:
         online_conn = self._storage_connector_api.get_online_connector()
     else:
         online_conn = None
     return engine.get_instance().sql(query, feature_store_name,
                                      online_conn, dataframe_type)
Code example #27
    def save(self, feature_group, feature_dataframe, write_options,
             validation_options):

        self._save_feature_group_metadata(feature_group, feature_dataframe,
                                          write_options)

        # Deequ validation runs only on Spark
        validation = feature_group._data_validation_engine.ingest_validate(
            feature_group, feature_dataframe)
        validation_id = validation.validation_id if validation is not None else None

        # Great Expectations validation runs on Python, and on non-stream feature groups on Spark
        ge_report = feature_group._great_expectation_engine.validate(
            feature_group, feature_dataframe, True, validation_options)

        if ge_report is not None and ge_report.ingestion_result == "REJECTED":
            return None, ge_report

        offline_write_options = write_options
        online_write_options = self.get_kafka_config(write_options)

        return (
            engine.get_instance().save_dataframe(
                feature_group,
                feature_dataframe,
                hudi_engine.HudiEngine.HUDI_BULK_INSERT
                if feature_group.time_travel_format == "HUDI" else None,
                feature_group.online_enabled,
                None,
                offline_write_options,
                online_write_options,
                validation_id,
            ),
            ge_report,
        )
Code example #28
    def _save_feature_group_metadata(self, feature_group, feature_dataframe,
                                     write_options):

        # this means the FG doesn't exist yet and a new one should be created
        if len(feature_group.features) == 0:
            # User didn't provide a schema; extract it from the dataframe
            feature_group._features = engine.get_instance().parse_schema_feature_group(
                feature_dataframe
            )

        # set primary and partition key columns
        # we should move this to the backend
        for feat in feature_group.features:
            if feat.name in feature_group.primary_key:
                feat.primary = True
            if feat.name in feature_group.partition_key:
                feat.partition = True
            if (feature_group.hudi_precombine_key is not None
                    and feat.name == feature_group.hudi_precombine_key):
                feat.hudi_precombine_key = True

        if feature_group.stream:
            feature_group._options = write_options

        self._feature_group_api.save(feature_group)
        print("Feature Group created successfully, explore it at \n" +
              self._get_feature_group_url(feature_group))
Code example #29
    def save(self, feature_group):
        if len(feature_group.features) == 0:
            # If the user didn't specify the schema, parse it from the query
            on_demand_dataset = (
                engine.get_instance().register_on_demand_temporary_table(
                    feature_group, "read_ondmd"))
            feature_group._features = engine.get_instance().parse_schema_feature_group(
                on_demand_dataset
            )

        # set primary and partition key columns
        # we should move this to the backend
        for feat in feature_group.features:
            if feat.name in feature_group.primary_key:
                feat.primary = True

        self._feature_group_api.save(feature_group)
Code example #30
    def show(self, n: int, online: Optional[bool] = False):
        """Show the first N rows of the Query.

        # Arguments
            n: Number of rows to show.
            online: Show from online storage. Defaults to `False`.
        """
        query = self._query_constructor_api.construct_query(self)

        if online:
            sql_query = query.query_online
            online_conn = self._storage_connector_api.get_online_connector()
        else:
            sql_query = query.query
            online_conn = None

            # Register on demand feature groups as temporary tables
            self._register_on_demand(query.on_demand_fg_aliases)

            # Register hudi feature groups as temporary tables
            self._register_hudi_tables(
                query.hudi_cached_feature_groups,
                self._feature_store_id,
                self._feature_store_name,
                {},
            )

        return engine.get_instance().show(sql_query, self._feature_store_name,
                                          n, online_conn)
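A usage sketch for the method above (the `query` object is assumed to come from feature selection on one or more feature groups):

```python
# Hypothetical calls: preview 10 rows of the joined query.
query.show(10)               # offline path, registers temporary tables first
query.show(10, online=True)  # online path, skips the temporary-table registration
```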