@classmethod
def setup_databricks(
    cls,
    host,
    project,
    port=443,
    region_name="default",
    secrets_store="parameterstore",
    cert_folder="hops",
    hostname_verification=True,
    trust_store_path=None,
    api_key_file=None,
):
    connection = cls(
        host,
        port,
        project,
        region_name,
        secrets_store,
        hostname_verification,
        trust_store_path,
        cert_folder,
        api_key_file,
    )

    dbfs_folder = client.get_instance()._cert_folder_base

    os.makedirs(os.path.join(dbfs_folder, "scripts"), exist_ok=True)
    connection._get_clients(dbfs_folder)
    hive_host = connection._get_hivemetastore_hostname()
    connection._write_init_script(dbfs_folder)
    connection._print_instructions(
        cert_folder, client.get_instance()._cert_folder, hive_host
    )

    return connection
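# Usage sketch (illustrative, not part of the source): on a Databricks cluster,
# set up the connection once so certificates and the init script land on DBFS.
# The host, project name, and API key path below are placeholders.
conn = Connection.setup_databricks(
    host="my-instance.cloud.hopsworks.ai",
    project="my_project",
    api_key_file="/dbfs/hopsworks_api_key",
)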
def _get_kafka_config(self, write_options: Optional[dict] = None) -> dict:
    if write_options is None:
        write_options = {}
    # producer configuration properties, see
    # https://docs.confluent.io/platform/current/clients/librdkafka/html/md_CONFIGURATION.html
    config = {
        "security.protocol": "SSL",
        "ssl.ca.location": client.get_instance()._get_ca_chain_path(),
        "ssl.certificate.location": client.get_instance()._get_client_cert_path(),
        "ssl.key.location": client.get_instance()._get_client_key_path(),
        "client.id": socket.gethostname(),
        **write_options.get("kafka_producer_config", {}),
    }

    if isinstance(client.get_instance(), hopsworks.Client) or write_options.get(
        "internal_kafka", False
    ):
        # internal clients (or forced internal mode) talk to the internal listeners
        config["bootstrap.servers"] = ",".join(
            [
                endpoint.replace("INTERNAL://", "")
                for endpoint in self._kafka_api.get_broker_endpoints(
                    externalListeners=False
                )
            ]
        )
    elif isinstance(client.get_instance(), external.Client):
        # external clients connect through the externally advertised listeners
        config["bootstrap.servers"] = ",".join(
            [
                endpoint.replace("EXTERNAL://", "")
                for endpoint in self._kafka_api.get_broker_endpoints(
                    externalListeners=True
                )
            ]
        )
    return config
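# Usage sketch (assumes the confluent-kafka package is installed and `engine`
# stands in for the object exposing _get_kafka_config; names are illustrative):
from confluent_kafka import Producer

producer = Producer(engine._get_kafka_config({"internal_kafka": True}))
producer.produce("my_topic", b"payload")
producer.flush()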
def get_hostname_replaced_url(sub_path: str):
    """
    Construct and return a URL with the public Hopsworks hostname and the given sub-path.

    :param sub_path: URL sub-path to append after the base URL
    :return: href URL
    """
    href = urljoin(client.get_instance()._base_url, sub_path)
    url_parsed = client.get_instance().replace_public_host(urlparse(href))
    return url_parsed.geturl()
def _get_conn_str(self):
    credentials = {
        "sslTrustStore": client.get_instance()._get_jks_trust_store_path(),
        "trustStorePassword": client.get_instance()._cert_key,
        "sslKeyStore": client.get_instance()._get_jks_key_store_path(),
        "keyStorePassword": client.get_instance()._cert_key,
    }
    return self._connstr + ";".join(
        ["{}={}".format(option, value) for option, value in credentials.items()]
    )
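# The returned connection string has this shape (paths and password illustrative):
#   <self._connstr>sslTrustStore=/path/trustStore.jks;trustStorePassword=...;
#   sslKeyStore=/path/keyStore.jks;keyStorePassword=...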
def _create_hive_connection(self, feature_store): return hive.Connection( host=client.get_instance()._host, port=9085, # database needs to be set every time, 'default' doesn't work in pyhive database=feature_store, auth="CERTIFICATES", truststore=client.get_instance()._get_jks_trust_store_path(), keystore=client.get_instance()._get_jks_key_store_path(), keystore_password=client.get_instance()._cert_key, )
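# Usage sketch (illustrative): read features over the Hive connection with
# pandas; `engine` stands in for the object exposing _create_hive_connection.
import pandas as pd

conn = engine._create_hive_connection("my_project_featurestore")
try:
    df = pd.read_sql("SELECT * FROM `fg_1`", conn)
finally:
    conn.close()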
def get(self, metadata_instance, name: str = None): """Get the tags of a training dataset or feature group. Gets all tags if no tag name is specified. :param metadata_instance: metadata object of training dataset to get the tags for :type metadata_instance: TrainingDataset, FeatureGroup :param name: tag name :type name: str :return: dict of tag name/values :rtype: dict """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, self._entity_type, metadata_instance.id, "tags", ] if name is not None: path_params.append(name) return { tag._name: json.loads(tag._value) for tag in tag.Tag.from_response_json( _client._send_request("GET", path_params) ) }
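# Usage sketch (illustrative names): fetch all tags of a feature group, or a
# single one by name; `tags_api` stands in for an instance of this API class.
all_tags = tags_api.get(fg)                     # e.g. {"sensitivity": "high"}
one_tag = tags_api.get(fg, name="sensitivity")  # only the named tag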
def get(self, name, connector_type):
    """Get storage connector with name and type.

    :param name: name of the storage connector
    :type name: str
    :param connector_type: connector type
    :type connector_type: str
    :return: the storage connector
    :rtype: StorageConnector
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "storageconnectors",
        connector_type,
    ]
    result = [
        conn
        for conn in _client._send_request("GET", path_params)
        if conn["name"] == name
    ]

    if len(result) == 1:
        return storage_connector.StorageConnector.from_response_json(result[0])
    else:
        raise Exception(
            "Could not find the storage connector `{}` with type `{}`.".format(
                name, connector_type
            )
        )
def save_stream_dataframe(
    self,
    feature_group,
    dataframe,
    query_name,
    output_mode,
    await_termination,
    timeout,
    checkpoint_dir,
    write_options,
):
    serialized_df = self._online_fg_to_avro(
        feature_group, self._encode_complex_features(feature_group, dataframe)
    )

    if query_name is None:
        query_name = "insert_stream_" + feature_group._online_topic_name

    # default the checkpoint location to the project's Resources dataset
    if checkpoint_dir is None:
        checkpoint_dir = (
            "/Projects/"
            + client.get_instance()._project_name
            + "/Resources/"
            + query_name
            + "-checkpoint"
        )

    query = (
        serialized_df.writeStream.outputMode(output_mode)
        .format(self.KAFKA_FORMAT)
        .option("checkpointLocation", checkpoint_dir)
        .options(**write_options)
        .option("topic", feature_group._online_topic_name)
        .queryName(query_name)
        .start()
    )

    if await_termination:
        query.awaitTermination(timeout)

    return query
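# Usage sketch (illustrative): stream a Spark dataframe into the feature
# group's online topic; `spark_engine`, `fg`, and `df` are placeholders.
query = spark_engine.save_stream_dataframe(
    fg,
    df,
    query_name=None,
    output_mode="update",
    await_termination=False,
    timeout=None,
    checkpoint_dir=None,
    write_options={},
)
query.stop()  # stop the streaming query when done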
def update_statistics_config(self, feature_group_instance): """Update the statistics configuration of a feature group. :param feature_group_instance: metadata object of feature group :type feature_group_instance: FeatureGroup """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, "featuregroups", feature_group_instance.id, ] headers = {"content-type": "application/json"} query_params = {"updateStatsSettings": True} return feature_group_instance.update_from_response_json( _client._send_request( "PUT", path_params, query_params, headers=headers, data=feature_group_instance.json(), ), )
def get_all(self, feature_group_id):
    """Get all validation reports attached to a feature group.

    :return: validation reports
    :rtype: list[ValidationReport]
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "featuregroups",
        feature_group_id,
        "validationreport",
    ]
    headers = {"content-type": "application/json"}
    query_params = {
        "sort_by": "validation_time:desc",
        "offset": 0,
        "fields": "content",
    }
    return ValidationReport.from_response_json(
        _client._send_request("GET", path_params, query_params, headers=headers)
    )
def post( self, metadata_instance, code, entity_id, code_type, databricks_cluster_id=None, ): _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, self._entity_type, metadata_instance.id, "code", ] headers = {"content-type": "application/json"} query_params = { "entityId": entity_id, "type": code_type, "databricksClusterId": databricks_cluster_id, } _client._send_request( "POST", path_params, query_params, headers=headers, data=code.json() )
def create_mysql_engine(online_conn, external):
    online_options = online_conn.spark_options()
    # Replace the first part of the string returned by Hopsworks,
    # jdbc:mysql://, with the sqlalchemy one plus username and password.
    # useSSL and allowPublicKeyRetrieval are not valid properties for the
    # pymysql driver; to use SSL we would have to do something like this:
    #   ssl_args = {"ssl_ca": ca_path}
    #   engine = create_engine(
    #       "mysql+pymysql://<user>:<pass>@<addr>/<schema>", connect_args=ssl_args
    #   )
    if external:
        # This only works with external clients.
        # Hopsworks clients should use the storage connector.
        online_options["url"] = re.sub(
            "/[0-9.]+:",
            "/{}:".format(client.get_instance().host),
            online_options["url"],
        )

    sql_alchemy_conn_str = (
        online_options["url"]
        .replace(
            "jdbc:mysql://",
            "mysql+pymysql://"
            + online_options["user"]
            + ":"
            + online_options["password"]
            + "@",
        )
        .replace("useSSL=false&", "")
        .replace("?allowPublicKeyRetrieval=true", "")
    )
    # the default connection pool size kept by the engine is 5
    sql_alchemy_engine = create_engine(sql_alchemy_conn_str, pool_recycle=3600)
    return sql_alchemy_engine
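# Usage sketch (illustrative): build an engine for an external client and run
# a query; `online_conn` stands in for a JDBC connector from the feature store.
from sqlalchemy import text

engine = create_mysql_engine(online_conn, external=True)
with engine.connect() as connection:
    rows = connection.execute(text("SELECT 1")).fetchall()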
def compute(self, training_dataset_instance, td_app_conf):
    """Set up a Hopsworks job to compute the query and write the training dataset.

    Args:
        training_dataset_instance (TrainingDataset): the metadata instance of the
            training dataset.
        td_app_conf: the configuration for the training dataset job application.
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "trainingdatasets",
        training_dataset_instance.id,
        "compute",
    ]
    headers = {"content-type": "application/json"}
    return job.Job.from_response_json(
        _client._send_request(
            "POST", path_params, headers=headers, data=td_app_conf.json()
        )
    )
def get_service(self, service): _client = client.get_instance() path_params = [ "services", service, ] return _client._send_request("GET", path_params)["items"][0]
def get(self, metadata_instance, validation_time=None, commit_time=None):
    """Get the validations of an instance, optionally filtered by validation time or commit time."""
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        self._entity_type,
        metadata_instance.id,
        "validations",
    ]
    headers = {"content-type": "application/json"}
    if validation_time is not None:
        query_params = {
            "filter_by": "validation_time_eq:" + str(validation_time),
        }
    elif commit_time is not None:
        query_params = {
            "filter_by": "commit_time_eq:" + str(commit_time),
        }
    else:
        query_params = None

    return fgv.FeatureGroupValidation.from_response_json(
        _client._send_request("GET", path_params, query_params, headers=headers)
    )
def get(self, name, version, fg_type):
    """Get the metadata of a feature group with a certain name and version.

    :param name: name of the feature group
    :type name: str
    :param version: version of the feature group
    :type version: int
    :param fg_type: type of the feature group to return
    :type fg_type: str
    :return: feature group metadata object
    :rtype: FeatureGroup
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "featuregroups",
        name,
    ]
    query_params = {"version": version}
    fg_json = _client._send_request("GET", path_params, query_params)[0]
    if fg_type == self.CACHED:
        return feature_group.FeatureGroup.from_response_json(fg_json)
    else:
        return feature_group.OnDemandFeatureGroup.from_response_json(fg_json)
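# Usage sketch (illustrative): retrieve a cached feature group by name and
# version; `fg_api` stands in for an instance of the surrounding API class.
fg = fg_api.get("transactions", version=1, fg_type=fg_api.CACHED)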
def get_last(self, feature_group_id):
    """Gets the latest validation report of a feature group."""
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "featuregroups",
        feature_group_id,
        "validationreport",
    ]
    headers = {"content-type": "application/json"}
    query_params = {
        "sort_by": "validation_time:desc",
        "offset": 0,
        "limit": 1,
        "fields": "content",
    }
    # take the single most recent report; assumes at least one report exists
    return ValidationReport.from_response_json(
        _client._send_request("GET", path_params, query_params, headers=headers)
    )[0]
def create(self, expectation):
    """Create a feature store expectation or attach it by name to a feature group.

    :param expectation: expectation object to be created for a feature store
    :type expectation: `Expectation`
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "expectations",
    ]
    headers = {"content-type": "application/json"}
    payload = expectation.json() if expectation else None
    _client._send_request("PUT", path_params, headers=headers, data=payload)
def get_transformation_fn(self, name, version):
    """
    Retrieve a transformation function from the backend.

    Args:
        name: TransformationFunction name, required name of the transformation function.
        version: TransformationFunction version, required version of the transformation function.
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "transformationfunctions",
    ]
    # filter by name (and optionally version) only when a name is given;
    # without a name, all transformation functions are returned
    query_params = None
    if name:
        query_params = {"name": name}
        if version:
            query_params["version"] = version
    return transformation_function.TransformationFunction.from_response_json(
        _client._send_request("GET", path_params, query_params)
    )
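# Usage sketch (illustrative): fetch one function by name and version, or all
# of them; `tf_api` stands in for an instance of the surrounding API class.
min_max = tf_api.get_transformation_fn("min_max_scaler", version=1)
all_fns = tf_api.get_transformation_fn(name=None, version=None)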
def get(self, name=None, feature_group=None): """Get the expectations of a feature store or feature group. Gets all feature store expectations if no feature group is specified. Gets all feature store or feature group expectations if no name is specified. :param name: expectation name :type name: str :param feature_group: feature group to get the expectations of :type feature_group: FeatureGroup :return: list of expectations :rtype: list of dict """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, ] if feature_group is not None: path_params.extend([self._entity_type, feature_group.id, "expectations"]) else: path_params.append("expectations") if name: path_params.append(name) return expectation.Expectation.from_response_json( _client._send_request("GET", path_params) )
def save(self, feature_group_instance): """Save feature group metadata to the feature store. :param feature_group_instance: metadata object of feature group to be saved :type feature_group_instance: FeatureGroup :return: updated metadata object of the feature group :rtype: FeatureGroup """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, "featuregroups", ] headers = {"content-type": "application/json"} return feature_group_instance.update_from_response_json( _client._send_request( "POST", path_params, headers=headers, data=feature_group_instance.json(), ), )
def get_path(self, metadata_instance, training_dataset_version=None): _client = client.get_instance() if isinstance(metadata_instance, feature_view.FeatureView): return [ "project", _client._project_id, "featurestores", self._feature_store_id, "featureview", metadata_instance.name, "version", metadata_instance.version, "trainingdatasets", "version", training_dataset_version, "statistics", ] else: return [ "project", _client._project_id, "featurestores", self._feature_store_id, self._entity_type, metadata_instance.id, "statistics", ]
def init_serving(self, entity, batch, external):
    if external is None:
        external = isinstance(client.get_instance(), client.external.Client)
    # `init_prepared_statement` must come last, because the other
    # initialisations have to complete successfully before it is able to
    # fetch feature vectors.
    self.init_transformation(entity)
    self.init_prepared_statement(entity, batch, external)
def launch(self, name): _client = client.get_instance() path_params = [ "project", _client._project_id, "jobs", name, "executions" ] _client._send_request("POST", path_params)
def get(self, name, connector_type):
    """Get storage connector with name and type.

    :param name: name of the storage connector
    :type name: str
    :param connector_type: connector type
    :type connector_type: str
    :return: the storage connector
    :rtype: StorageConnector
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "storageconnectors",
        connector_type,
        name,
    ]
    query_params = {"temporaryCredentials": True}
    return storage_connector.StorageConnector.from_response_json(
        _client._send_request("GET", path_params, query_params=query_params)
    )
def commit(self, feature_group_instance, feature_group_commit_instance): """ Save feature group commit metadata. # Arguments feature_group_instance: FeatureGroup, required metadata object of feature group. feature_group_commit_instance: FeatureGroupCommit, required metadata object of feature group commit. # Returns `FeatureGroupCommit`. """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, "featuregroups", feature_group_instance.id, "commits", ] headers = {"content-type": "application/json"} return feature_group_commit_instance.update_from_response_json( _client._send_request( "POST", path_params, headers=headers, data=feature_group_commit_instance.json(), ), )
def add(self, metadata_instance, name, value): """Attach a name/value tag to a training dataset or feature group. A tag consists of a name/value pair. Tag names are unique identifiers. The value of a tag can be any valid json - primitives, arrays or json objects. :param metadata_instance: metadata object of the instance to add the tag for :type metadata_instance: TrainingDataset, FeatureGroup :param name: name of the tag to be added :type name: str :param value: value of the tag to be added :type value: str """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, self._entity_type, metadata_instance.id, "tags", name, ] headers = {"content-type": "application/json"} json_value = json.dumps(value) _client._send_request("PUT", path_params, headers=headers, data=json_value)
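# Usage sketch (illustrative): attach a JSON-valued tag to a feature group;
# `tags_api` and `fg` are placeholders for an API instance and a FeatureGroup.
tags_api.add(fg, "sensitivity", {"pii": True, "owner": "data-eng"})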
def get_commit_details(self, feature_group_instance, wallclock_timestamp, limit):
    """
    Get feature group commit metadata.
    # Arguments
        feature_group_instance: FeatureGroup, required metadata object of feature group.
        wallclock_timestamp: specific point in time to filter commits by.
        limit: number of commits to retrieve.
    # Returns
        `FeatureGroupCommit`.
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "featuregroups",
        feature_group_instance.id,
        "commits",
    ]
    headers = {"content-type": "application/json"}
    query_params = {"sort_by": "committed_on:desc", "offset": 0, "limit": limit}
    if wallclock_timestamp is not None:
        # note: "commited_on_ltoeq" (sic) is the filter key the REST API expects
        query_params["filter_by"] = "commited_on_ltoeq:" + str(wallclock_timestamp)

    return feature_group_commit.FeatureGroupCommit.from_response_json(
        _client._send_request("GET", path_params, query_params, headers=headers),
    )
def ingestion(self, feature_group_instance, ingestion_conf):
    """
    Set up a Hopsworks job for dataframe ingestion.
    Args:
        feature_group_instance: FeatureGroup, required metadata object of feature group.
        ingestion_conf: the configuration for the ingestion job application
    """
    _client = client.get_instance()
    path_params = [
        "project",
        _client._project_id,
        "featurestores",
        self._feature_store_id,
        "featuregroups",
        feature_group_instance.id,
        "ingestion",
    ]
    headers = {"content-type": "application/json"}
    return ingestion_job.IngestionJob.from_response_json(
        _client._send_request(
            "POST", path_params, headers=headers, data=ingestion_conf.json()
        ),
    )
def commit_details(self, feature_group_instance, limit): """ Get feature group commit metadata. # Arguments feature_group_instance: FeatureGroup, required metadata object of feature group. limit: number of commits to retrieve # Returns `FeatureGroupCommit`. """ _client = client.get_instance() path_params = [ "project", _client._project_id, "featurestores", self._feature_store_id, "featuregroups", feature_group_instance.id, "commits", ] headers = {"content-type": "application/json"} query_params = { "sort_by": "committed_on:desc", "offset": 0, "limit": limit } return feature_group_commit.FeatureGroupCommit.from_response_json( _client._send_request("GET", path_params, query_params, headers=headers), )