Example #1

    @classmethod
    def setup_databricks(
        cls,
        host,
        project,
        port=443,
        region_name="default",
        secrets_store="parameterstore",
        cert_folder="hops",
        hostname_verification=True,
        trust_store_path=None,
        api_key_file=None,
    ):
        connection = cls(
            host,
            port,
            project,
            region_name,
            secrets_store,
            hostname_verification,
            trust_store_path,
            cert_folder,
            api_key_file,
        )

        dbfs_folder = client.get_instance()._cert_folder_base

        os.makedirs(os.path.join(dbfs_folder, "scripts"), exist_ok=True)
        connection._get_clients(dbfs_folder)
        hive_host = connection._get_hivemetastore_hostname()
        connection._write_init_script(dbfs_folder)
        connection._print_instructions(
            cert_folder, client.get_instance()._cert_folder, hive_host
        )

        return connection
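
A hedged usage sketch of the classmethod above, assuming it is exposed on hsfs's Connection class (the `cls` parameter suggests a classmethod); the host, project, and API key path below are placeholders, not values from the source:

# Hypothetical usage -- placeholder host, project, and API key path.
import hsfs

connection = hsfs.connection.Connection.setup_databricks(
    host="my-instance.cloud.hopsworks.ai",
    project="my_project",
    api_key_file="/dbfs/hops/api_key.txt",
)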
Example #2

    def _get_kafka_config(self, write_options: dict = {}) -> dict:
        # producer configuration properties
        # https://docs.confluent.io/platform/current/clients/librdkafka/html/md_CONFIGURATION.html
        config = {
            "security.protocol": "SSL",
            "ssl.ca.location": client.get_instance()._get_ca_chain_path(),
            "ssl.certificate.location": client.get_instance()._get_client_cert_path(),
            "ssl.key.location": client.get_instance()._get_client_key_path(),
            "client.id": socket.gethostname(),
            **write_options.get("kafka_producer_config", {}),
        }

        if isinstance(client.get_instance(), hopsworks.Client) or write_options.get(
            "internal_kafka", False
        ):
            config["bootstrap.servers"] = ",".join(
                [
                    endpoint.replace("INTERNAL://", "")
                    for endpoint in self._kafka_api.get_broker_endpoints(
                        externalListeners=False
                    )
                ]
            )
        elif isinstance(client.get_instance(), external.Client):
            config["bootstrap.servers"] = ",".join(
                [
                    endpoint.replace("EXTERNAL://", "")
                    for endpoint in self._kafka_api.get_broker_endpoints(
                        externalListeners=True
                    )
                ]
            )
        return config
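
The dict built above follows librdkafka producer property names, so it can be handed directly to a confluent-kafka Producer. A minimal sketch, assuming an `engine` object exposing the method above and a placeholder topic (both assumptions for illustration):

# Sketch: pass the librdkafka-style config to a confluent-kafka producer.
from confluent_kafka import Producer

config = engine._get_kafka_config({"internal_kafka": True})  # `engine` is assumed
producer = Producer(config)
producer.produce("my_topic", value=b"payload")  # asynchronous send; topic is a placeholder
producer.flush()  # block until all messages are delivered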
Example #3
def get_hostname_replaced_url(sub_path: str):
    """
    construct and return an url with public hopsworks hostname and sub path
    :param self:
    :param sub_path: url sub-path after base url
    :return: href url
    """
    href = urljoin(client.get_instance()._base_url, sub_path)
    url_parsed = client.get_instance().replace_public_host(urlparse(href))
    return url_parsed.geturl()
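
An illustrative call (the sub-path is made up, not from the source):

url = get_hostname_replaced_url("p/119/jobs")
print(url)  # e.g. https://<public-hopsworks-host>/p/119/jobs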
Example #4
    def _get_conn_str(self):
        credentials = {
            "sslTrustStore": client.get_instance()._get_jks_trust_store_path(),
            "trustStorePassword": client.get_instance()._cert_key,
            "sslKeyStore": client.get_instance()._get_jks_key_store_path(),
            "keyStorePassword": client.get_instance()._cert_key,
        }

        return self._connstr + ";".join(
            ["{}={}".format(k, v) for k, v in credentials.items()]
        )
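
To make the resulting string concrete, a self-contained sketch of the join above, with hypothetical paths and password standing in for the client's values:

# Hypothetical values illustrating the option-joining above.
connstr = "jdbc:hive2://10.0.0.1:9085/default;auth=noSasl;ssl=true;"
credentials = {
    "sslTrustStore": "/tmp/trustStore.jks",
    "trustStorePassword": "abc123",
}
print(connstr + ";".join("{}={}".format(k, v) for k, v in credentials.items()))
# jdbc:hive2://10.0.0.1:9085/default;auth=noSasl;ssl=true;sslTrustStore=/tmp/trustStore.jks;trustStorePassword=abc123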
Example #5
    def _create_hive_connection(self, feature_store):
        return hive.Connection(
            host=client.get_instance()._host,
            port=9085,
            # database needs to be set every time, 'default' doesn't work in pyhive
            database=feature_store,
            auth="CERTIFICATES",
            truststore=client.get_instance()._get_jks_trust_store_path(),
            keystore=client.get_instance()._get_jks_key_store_path(),
            keystore_password=client.get_instance()._cert_key,
        )
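
Since PyHive connections are DBAPI-compliant, the returned connection can be consumed with pandas; a sketch with placeholder names (`engine`, the feature store name, and the query are assumptions):

import pandas as pd

conn = engine._create_hive_connection("my_project_featurestore")  # placeholder feature store
df = pd.read_sql("SELECT * FROM fg_1 LIMIT 10", conn)             # placeholder query
conn.close()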
Example #6
    def get(self, metadata_instance, name: str = None):
        """Get the tags of a training dataset or feature group.

        Gets all tags if no tag name is specified.

        :param metadata_instance: metadata object of training dataset
            to get the tags for
        :type metadata_instance: TrainingDataset, FeatureGroup
        :param name: tag name
        :type name: str
        :return: dict of tag name/values
        :rtype: dict
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            self._entity_type,
            metadata_instance.id,
            "tags",
        ]

        if name is not None:
            path_params.append(name)

        return {
            tag._name: json.loads(tag._value)
            for tag in tag.Tag.from_response_json(
                _client._send_request("GET", path_params)
            )
        }
Example #7

    def get(self, name, connector_type):
        """Get storage connector with name and type.

        :param name: name of the storage connector
        :type name: str
        :param connector_type: connector type
        :type connector_type: str
        :return: the storage connector
        :rtype: StorageConnector
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "storageconnectors",
            connector_type,
        ]
        result = [
            conn
            for conn in _client._send_request("GET", path_params)
            if conn["name"] == name
        ]

        if len(result) == 1:
            return storage_connector.StorageConnector.from_response_json(result[0])
        else:
            raise Exception(
                "Could not find the storage connector `{}` with type `{}`.".format(
                    name, connector_type
                )
            )
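
The `path_params` lists used throughout these examples are presumably joined by `_send_request` into the REST resource path; a minimal sketch of that mapping (the joining logic is an assumption, not the client's verbatim code):

path_params = ["project", 119, "featurestores", 67, "storageconnectors", "JDBC"]
path = "/" + "/".join(str(p) for p in path_params)
print(path)  # /project/119/featurestores/67/storageconnectors/JDBC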
Example #8
    def save_stream_dataframe(
        self,
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        checkpoint_dir,
        write_options,
    ):
        serialized_df = self._online_fg_to_avro(
            feature_group,
            self._encode_complex_features(feature_group, dataframe))

        if query_name is None:
            query_name = "insert_stream_" + feature_group._online_topic_name

        query = (
            serialized_df.writeStream.outputMode(output_mode)
            .format(self.KAFKA_FORMAT)
            .option(
                "checkpointLocation",
                "/Projects/"
                + client.get_instance()._project_name
                + "/Resources/"
                + query_name
                + "-checkpoint"
                if checkpoint_dir is None
                else checkpoint_dir,
            )
            .options(**write_options)
            .option("topic", feature_group._online_topic_name)
            .queryName(query_name)
            .start()
        )

        if await_termination:
            query.awaitTermination(timeout)

        return query
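
The method returns a Spark StreamingQuery, so a caller passing await_termination=False can monitor or stop the query itself; a short sketch using standard StreamingQuery members (the variable names are placeholders):

query = engine.save_stream_dataframe(fg, df, None, "append", False, None, None, {})
print(query.isActive, query.status)  # standard StreamingQuery introspection
query.stop()                         # terminate the streaming insert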
Example #9
    def update_statistics_config(self, feature_group_instance):
        """Update the statistics configuration of a feature group.

        :param feature_group_instance: metadata object of feature group
        :type feature_group_instance: FeatureGroup
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_instance.id,
        ]
        headers = {"content-type": "application/json"}
        query_params = {"updateStatsSettings": True}
        return feature_group_instance.update_from_response_json(
            _client._send_request(
                "PUT",
                path_params,
                query_params,
                headers=headers,
                data=feature_group_instance.json(),
            ), )
Example #10

    def get_all(self, feature_group_id):
        """Get all validation reports attached to a feature group.

        :return: validation reports
        :rtype: list[dict]
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_id,
            "validationreport",
        ]
        headers = {"content-type": "application/json"}
        query_params = {
            "sort_by": "validation_time:desc",
            "offset": 0,
            "fields": "content",
        }

        return ValidationReport.from_response_json(
            _client._send_request("GET",
                                  path_params,
                                  query_params,
                                  headers=headers))
Example #11

    def post(
        self,
        metadata_instance,
        code,
        entity_id,
        code_type,
        databricks_cluster_id=None,
    ):
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            self._entity_type,
            metadata_instance.id,
            "code",
        ]

        headers = {"content-type": "application/json"}

        query_params = {
            "entityId": entity_id,
            "type": code_type,
            "databricksClusterId": databricks_cluster_id,
        }

        _client._send_request(
            "POST", path_params, query_params, headers=headers, data=code.json()
        )
Example #12
def create_mysql_engine(online_conn, external):
    online_options = online_conn.spark_options()
    # Here we are replacing the first part of the string returned by Hopsworks,
    # jdbc:mysql://, with the sqlalchemy one plus username and password.
    # useSSL and allowPublicKeyRetrieval are not valid properties for the pymysql
    # driver; to use SSL we would have to do something like this:
    # ssl_args = {'ssl_ca': ca_path}
    # engine = create_engine("mysql+pymysql://<user>:<pass>@<addr>/<schema>", connect_args=ssl_args)
    if external:
        # This only works with external clients.
        # Hopsworks clients should use the storage connector
        online_options["url"] = re.sub(
            "/[0-9.]+:",
            "/{}:".format(client.get_instance().host),
            online_options["url"],
        )

    sql_alchemy_conn_str = (
        online_options["url"]
        .replace(
            "jdbc:mysql://",
            "mysql+pymysql://"
            + online_options["user"]
            + ":"
            + online_options["password"]
            + "@",
        )
        .replace("useSSL=false&", "")
        .replace("?allowPublicKeyRetrieval=true", "")
    )

    # default connection pool size kept by engine is 5
    sql_alchemy_engine = create_engine(sql_alchemy_conn_str, pool_recycle=3600)
    return sql_alchemy_engine
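
A usage sketch for the returned SQLAlchemy engine (the connector object and the query are placeholders):

from sqlalchemy import text

engine = create_mysql_engine(online_conn, external=True)  # `online_conn` is assumed
with engine.connect() as conn:
    rows = conn.execute(text("SELECT 1")).fetchall()      # placeholder query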
Example #13

    def compute(self, training_dataset_instance, td_app_conf):
        """
        Set up a Hopsworks job to compute the query and write the training dataset.

        Args:
            training_dataset_instance (training_dataset): the metadata instance of the training dataset
            td_app_conf: the configuration for the training dataset job application
        """

        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "trainingdatasets",
            training_dataset_instance.id,
            "compute",
        ]
        headers = {"content-type": "application/json"}
        return job.Job.from_response_json(
            _client._send_request("POST",
                                  path_params,
                                  headers=headers,
                                  data=td_app_conf.json()))
Example #14

    def get_service(self, service):
        _client = client.get_instance()
        path_params = [
            "services",
            service,
        ]
        return _client._send_request("GET", path_params)["items"][0]
Example #15

    def get(self, metadata_instance, validation_time=None, commit_time=None):
        """Gets the validations for a specific validation time or commit time for an instance."""
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            self._entity_type,
            metadata_instance.id,
            "validations",
        ]
        headers = {"content-type": "application/json"}
        if validation_time is not None:
            query_params = {
                "filter_by": "validation_time_eq:" + str(validation_time),
            }
        elif commit_time:
            query_params = {
                "filter_by": "commit_time_eq:" + str(commit_time),
            }
        else:
            query_params = None

        return fgv.FeatureGroupValidation.from_response_json(
            _client._send_request("GET", path_params, query_params, headers=headers)
        )
Example #16

    def get(self, name, version, fg_type):
        """Get the metadata of a feature group with a certain name and version.

        :param name: name of the feature group
        :type name: str
        :param version: version of the feature group
        :type version: int
        :param fg_type: type of the feature group to return
        :type fg_type: str
        :return: feature group metadata object
        :rtype: FeatureGroup
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            name,
        ]
        query_params = {"version": version}
        fg_json = _client._send_request("GET", path_params, query_params)[0]

        if fg_type == self.CACHED:
            return feature_group.FeatureGroup.from_response_json(fg_json)
        else:
            return feature_group.OnDemandFeatureGroup.from_response_json(fg_json)
Example #17

    def get_last(self, feature_group_id):
        """Gets the latest Validation Report of a featuregroup."""
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_id,
            "validationreport",
        ]
        headers = {"content-type": "application/json"}
        query_params = {
            "sort_by": "validation_time:desc",
            "offset": 0,
            "limit": 1,
            "fields": "content",
        }

        return ValidationReport.from_response_json(
            _client._send_request("GET",
                                  path_params,
                                  query_params,
                                  headers=headers))[0]
Example #18

    def create(self, expectation):
        """Create a Feature Store expectation or attach it by name to a Feature Group.

        :param expectation: expectation object to be created for a feature store
        :type expectation: `Expectation`
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "expectations",
        ]

        headers = {"content-type": "application/json"}
        print("ExpectationsApi.expectation.to_dict()" +
              str(expectation.to_dict()))
        print("ExpectationsApi.expectation.rules[0].to_dict()" +
              str(expectation.rules[0].to_dict()))
        payload = expectation.json() if expectation else None
        print("ExpectationsApi.expectation.payload" + str(payload))
        _client._send_request("PUT",
                              path_params,
                              headers=headers,
                              data=payload)
Example #19
    def get_transformation_fn(self, name, version):
        """
        Retrieve a transformation function from the backend.

        Args:
            name: TransformationFunction name, required
                name of the transformation function.
            version: TransformationFunction version, required
                version of the transformation function.
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "transformationfunctions",
        ]

        if name:
            query_params = {"name": name}
            if version:
                query_params["version"] = version
            return transformation_function.TransformationFunction.from_response_json(
                _client._send_request("GET", path_params, query_params))
        else:
            return transformation_function.TransformationFunction.from_response_json(
                _client._send_request("GET", path_params))
Example #20
    def get(self, name=None, feature_group=None):
        """Get the expectations of a feature store or feature group.

        Gets all feature store expectations if no feature group is specified.
        Gets all feature store or feature group expectations if no name is specified.

        :param name: expectation name
        :type name: str
        :param feature_group: feature group to get the expectations of
        :type feature_group: FeatureGroup
        :return: list of expectations
        :rtype: list of dict
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
        ]

        if feature_group is not None:
            path_params.extend([self._entity_type, feature_group.id, "expectations"])
        else:
            path_params.append("expectations")

        if name:
            path_params.append(name)

        return expectation.Expectation.from_response_json(
            _client._send_request("GET", path_params)
        )
Example #21
    def save(self, feature_group_instance):
        """Save feature group metadata to the feature store.

        :param feature_group_instance: metadata object of feature group to be
            saved
        :type feature_group_instance: FeatureGroup
        :return: updated metadata object of the feature group
        :rtype: FeatureGroup
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
        ]
        headers = {"content-type": "application/json"}
        return feature_group_instance.update_from_response_json(
            _client._send_request(
                "POST",
                path_params,
                headers=headers,
                data=feature_group_instance.json(),
            ), )
Example #22

    def get_path(self, metadata_instance, training_dataset_version=None):
        _client = client.get_instance()
        if isinstance(metadata_instance, feature_view.FeatureView):
            return [
                "project",
                _client._project_id,
                "featurestores",
                self._feature_store_id,
                "featureview",
                metadata_instance.name,
                "version",
                metadata_instance.version,
                "trainingdatasets",
                "version",
                training_dataset_version,
                "statistics",
            ]
        else:
            return [
                "project",
                _client._project_id,
                "featurestores",
                self._feature_store_id,
                self._entity_type,
                metadata_instance.id,
                "statistics",
            ]
Example #23
    def init_serving(self, entity, batch, external):
        if external is None:
            external = isinstance(client.get_instance(), client.external.Client)
        # `init_prepared_statement` should be last because the other initialisations
        # have to complete successfully before it is able to fetch feature vectors.
        self.init_transformation(entity)
        self.init_prepared_statement(entity, batch, external)
Example #24
    def launch(self, name):
        _client = client.get_instance()
        path_params = [
            "project", _client._project_id, "jobs", name, "executions"
        ]

        _client._send_request("POST", path_params)
Example #25
    def get(self, name, connector_type):
        """Get storage connector with name and type.

        :param name: name of the storage connector
        :type name: str
        :param connector_type: connector type
        :type connector_type: str
        :return: the storage connector
        :rtype: StorageConnector
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "storageconnectors",
            connector_type,
            name,
        ]
        query_params = {"temporaryCredentials": True}
        return storage_connector.StorageConnector.from_response_json(
            _client._send_request("GET",
                                  path_params,
                                  query_params=query_params))
Example #26

    def commit(self, feature_group_instance, feature_group_commit_instance):
        """
        Save feature group commit metadata.
        # Arguments
        feature_group_instance: FeatureGroup, required
            metadata object of feature group.
        feature_group_commit_instance: FeatureGroupCommit, required
            metadata object of feature group commit.
        # Returns
            `FeatureGroupCommit`.
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "commits",
        ]
        headers = {"content-type": "application/json"}
        return feature_group_commit_instance.update_from_response_json(
            _client._send_request(
                "POST",
                path_params,
                headers=headers,
                data=feature_group_commit_instance.json(),
            ),
        )
Example #27
    def add(self, metadata_instance, name, value):
        """Attach a name/value tag to a training dataset or feature group.

        A tag consists of a name/value pair. Tag names are unique identifiers.
        The value of a tag can be any valid json - primitives, arrays or json objects.

        :param metadata_instance: metadata object of the instance to add the
            tag for
        :type metadata_instance: TrainingDataset, FeatureGroup
        :param name: name of the tag to be added
        :type name: str
        :param value: value of the tag to be added
        :type value: str
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            self._entity_type,
            metadata_instance.id,
            "tags",
            name,
        ]
        headers = {"content-type": "application/json"}
        json_value = json.dumps(value)
        _client._send_request("PUT", path_params, headers=headers, data=json_value)
Example #28

    def get_commit_details(self, feature_group_instance, wallclock_timestamp, limit):
        """
        Get feature group commit metadata.
        # Arguments
        feature_group_instance: FeatureGroup, required
            metadata object of feature group.
        wallclock_timestamp: specific point in time.
        limit: number of commits to retrieve.
        # Returns
            `FeatureGroupCommit`.
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "commits",
        ]
        headers = {"content-type": "application/json"}
        query_params = {"sort_by": "committed_on:desc", "offset": 0, "limit": limit}
        if wallclock_timestamp is not None:
            query_params["filter_by"] = "commited_on_ltoeq:" + str(wallclock_timestamp)

        return feature_group_commit.FeatureGroupCommit.from_response_json(
            _client._send_request("GET", path_params, query_params, headers=headers),
        )
Example #29

    def ingestion(self, feature_group_instance, ingestion_conf):
        """
        Set up a Hopsworks job for dataframe ingestion.
        Args:
        feature_group_instance: FeatureGroup, required
            metadata object of feature group.
        ingestion_conf: the configuration for the ingestion job application.
        """

        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "ingestion",
        ]

        headers = {"content-type": "application/json"}
        return ingestion_job.IngestionJob.from_response_json(
            _client._send_request(
                "POST", path_params, headers=headers, data=ingestion_conf.json()
            ),
        )
Example #30

    def commit_details(self, feature_group_instance, limit):
        """
        Get feature group commit metadata.
        # Arguments
        feature_group_instance: FeatureGroup, required
            metadata object of feature group.
        limit: number of commits to retrieve.
        # Returns
            `FeatureGroupCommit`.
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            self._feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "commits",
        ]
        headers = {"content-type": "application/json"}
        query_params = {
            "sort_by": "committed_on:desc",
            "offset": 0,
            "limit": limit
        }

        return feature_group_commit.FeatureGroupCommit.from_response_json(
            _client._send_request("GET",
                                  path_params,
                                  query_params,
                                  headers=headers), )