def uninstall_cli(api_client, cluster_id, all, jar, egg, maven_coordinates, maven_repo,  # noqa
                  maven_exclusion, pypi_package, pypi_repo, cran_package, cran_repo):
    """
    Mark libraries on a cluster to be uninstalled. Libraries which are marked to be uninstalled
    will stay attached until the cluster is restarted. (see `databricks clusters restart -h`).
    """
    if all:
        # Ask the Libraries API for the status directly. The module-level
        # `_cluster_status` helper echoes the status and returns nothing, so
        # calling `.get` on its result would fail with AttributeError.
        status = LibrariesApi(api_client).cluster_status(cluster_id)
        # `library_statuses` is read with a default so a response without
        # that key is treated as "no libraries".
        libraries = [entry['library'] for entry in status.get('library_statuses', [])]
        LibrariesApi(api_client).uninstall_libraries(cluster_id, libraries)
        _uninstall_cli_exit_help(cluster_id)
        return

    # Single-library mode: build the library spec from whichever option was given.
    library = _get_library_from_options(jar, egg, maven_coordinates, maven_repo,
                                        maven_exclusion, pypi_package, pypi_repo,
                                        cran_package, cran_repo)
    LibrariesApi(api_client).uninstall_libraries(cluster_id, [library])
    _uninstall_cli_exit_help(cluster_id)
def __init__(self, logger, **kwargs):
    """
    Build the API clients used by this object and remember the logger.

    :param logger: logger object stored on the instance
    :param **kwargs: forwarded unchanged to ApiClient;
                     keys should only include: token, host
    :type **kwargs: dict
    """
    self.logger = logger
    # One ApiClient is shared by both service wrappers.
    client = ApiClient(**kwargs)
    self.api_client = client
    self.cluster_client = ClusterApi(client)
    self.libraries_client = LibrariesApi(client)
def install_cli(api_client, cluster_id, jar, egg, maven_coordinates, maven_repo,  # noqa
                maven_exclusion, pypi_package, pypi_repo, cran_package, cran_repo):
    """
    Install a library on a cluster. Libraries must be first uploaded to dbfs or s3
    (see `dbfs cp -h`). Unlike the API, only one library can be installed for each execution of
    `databricks libraries install`.

    Users should only provide one of
    [--jar, --egg, --maven-coordinates, --pypi-package, --cran-package].
    """
    # Turn the CLI options into a single library spec.
    requested = _get_library_from_options(
        jar, egg, maven_coordinates, maven_repo, maven_exclusion,
        pypi_package, pypi_repo, cran_package, cran_repo)
    # The API takes a list, so wrap the one spec.
    libraries_api = LibrariesApi(api_client)
    libraries_api.install_libraries(cluster_id, [requested])
def get_library_state(profile, cluster_id):
    """Get the state of the library installation on the remote cluster

    Args:
        profile: value handed to ``connect`` to obtain the API client
            (presumably a databricks cli profile name -- confirm against ``connect``)
        cluster_id (str): Cluster ID

    Returns:
        list: the ``status`` field of every entry in ``library_statuses``
        (empty when that key is absent or None), or None when the
        status lookup raised and the error was printed
    """
    try:
        response = LibrariesApi(connect(profile)).cluster_status(cluster_id)
    except Exception as ex:
        print_error(ex)
        return None

    entries = response.get("library_statuses", None)
    if entries is None:
        return []
    return [entry["status"] for entry in entries]
def prepare_for_operationalization(cluster_id, api_client, dbfs_path, overwrite, spark_version):
    """
    Installs appropriate versions of several libraries to support operationalization.

    Args:
        cluster_id (str): cluster_id representing the cluster to prepare for operationalization
        api_client (ApiClient): the ApiClient object used to authenticate to the workspace
        dbfs_path (str): the path on dbfs to upload libraries to
        overwrite (bool): whether to overwrite existing files on dbfs with new files
            of the same name; also forces a fresh local download of the jar
        spark_version (str): key into COSMOSDB_JAR_FILE_OPTIONS selecting the jar url
            for the spark version installed on the databricks cluster

    Returns:
        The list of library specs (one jar entry followed by the pypi entries)
        that was submitted for installation
    """
    print("Preparing for operationlization...")

    jar_url = COSMOSDB_JAR_FILE_OPTIONS[spark_version]
    jar_file = os.path.basename(jar_url)

    # Reuse a local copy unless the caller asked to overwrite.
    if not overwrite and os.path.exists(jar_file):
        print("File {} already downloaded.".format(jar_file))
    else:
        print("Downloading {}...".format(jar_url))
        jar_file = urlretrieve(jar_url, jar_file)[0]

    # Push the jar to dbfs.
    dbfs_target = Path(dbfs_path, jar_file).as_posix()
    print("Uploading CosmosDB driver to databricks at {}".format(dbfs_target))
    if dbfs_file_exists(api_client, dbfs_target) and overwrite:
        print("Overwriting file at {}".format(dbfs_target))
    DbfsApi(api_client).cp(
        recursive=False, src=jar_file, dst=dbfs_target, overwrite=overwrite)

    # The uploaded jar first, then every pypi dependency.
    required_libs = [{"jar": dbfs_target}] + [
        {"pypi": {"package": name}} for name in PYPI_O16N_LIBS
    ]
    print(
        "Installing jar and pypi libraries required for operationalization...")
    LibrariesApi(api_client).install_libraries(cluster_id, required_libs)
    return required_libs
def install_cli(api_client, cluster_id, jar, egg, whl, maven_coordinates, maven_repo,  # noqa
                maven_exclusion, pypi_package, pypi_repo, cran_package, cran_repo):
    """
    Install a library on a cluster. Libraries must be first uploaded to dbfs or s3
    (see `dbfs cp -h`). Unlike the API, only one library can be installed for each execution of
    `databricks libraries install`.

    Users should only provide one of
    [--jar, --egg, --whl, --maven-coordinates, --pypi-package, --cran-package].

    Installing a whl library on clusters running Databricks Runtime 4.2 or higher effectively
    runs the pip command against the wheel file directly on driver and executors. The library
    must satisfy the wheel file name convention.
    To install multiple wheel files, use the .wheelhouse.zip file that includes all the wheel
    files with the --whl option.

    Installing a wheel library on clusters running Databricks Runtime lower than 4.2 just adds
    the file to the PYTHONPATH variable, without installing the dependencies.
    More information is available here:
    https://docs.databricks.com/api/latest/libraries.html#managedlibrariesmanagedlibraryserviceinstalllibraries
    """
    # Turn the CLI options into a single library spec.
    requested = _get_library_from_options(
        jar, egg, whl, maven_coordinates, maven_repo, maven_exclusion,
        pypi_package, pypi_repo, cran_package, cran_repo)
    # The API takes a list, so wrap the one spec.
    libraries_api = LibrariesApi(api_client)
    libraries_api.install_libraries(cluster_id, [requested])
def _cluster_status(api_client, cluster_id):
    """Fetch the library status of one cluster and echo it, pretty-formatted."""
    status = LibrariesApi(api_client).cluster_status(cluster_id)
    rendered = pretty_format(status)
    click.echo(rendered)
def _all_cluster_statuses(config):
    """Fetch the library statuses of all clusters and echo them, pretty-formatted."""
    statuses = LibrariesApi(config).all_cluster_statuses()
    rendered = pretty_format(statuses)
    click.echo(rendered)
class ClusterManagement:
    """Create/update clusters and reconcile the libraries installed on them."""

    def __init__(self, logger, **kwargs):
        """
        :param logger: logger used for every progress and error message
        :param **kwargs: reserved python word for unlimited parameters
                         keys should only include: token, host
        :type **kwargs: dict
        """
        self.api_client = ApiClient(**kwargs)
        self.cluster_client = ClusterApi(self.api_client)
        self.libraries_client = LibrariesApi(self.api_client)
        self.logger = logger

    def create_cluster(self, cluster_specs):
        """function to build/edit cluster and start

        Looks the cluster up by name; edits it when the spec differs from
        the existing cluster, creates it when the lookup (or edit) raises.
        Then polls until the cluster has left every transitional state,
        starting it if it is terminated.

        :param cluster_specs: cluster specs in clusterconf.yaml
        :type cluster_specs: dict
        :return: id of the created/updated cluster
        """
        try:
            cluster = self.cluster_client.get_cluster_by_name(
                cluster_specs["cluster_name"])

            self.logger.info(f"cluster {cluster['cluster_name']} exists "
                             f"with id {cluster['cluster_id']}")
            self.logger.debug(cluster_specs)
            self.logger.debug(cluster)

            # The spec matches when every (key, value) pair of the spec is
            # also present in the existing cluster description.
            if not cluster_specs.items() <= cluster.items():
                self.logger.warning(
                    "cluster spec doesn't match existing cluster")
                # NOTE: this writes the id into the caller's dict.
                cluster_specs['cluster_id'] = cluster['cluster_id']
                self.cluster_client.edit_cluster(cluster_specs)
            else:
                self.logger.info("cluster spec matches")
        except Exception:
            # Any failure above falls back to creating the cluster.
            cluster = self.cluster_client.create_cluster(cluster_specs)
            self.logger.info(f"the cluster {cluster} is being created")
            time.sleep(30)

        cluster_id = cluster['cluster_id']
        # NOTE(review): _cluster_status returns None when the lookup fails,
        # in which case the subscripts below raise TypeError.
        status = self._cluster_status(cluster_id)

        # Wait out transitional states.
        while status['state'] in ["RESTARTING", "RESIZING", "TERMINATING"]:
            self.logger.info(
                f"waiting for the cluster. status {status['state']}")
            time.sleep(10)
            status = self._cluster_status(cluster_id)

        # Start the cluster if it is terminated, then wait while it is pending.
        while status['state'] in ["TERMINATED", "PENDING"]:
            self.logger.info(f"cluster status {status['state']}")
            if status['state'] == "TERMINATED":
                self.logger.info(f"starting cluster, status {status['state']}")
                self.cluster_client.start_cluster(cluster_id)

            time.sleep(10)
            status = self._cluster_status(cluster_id)

        self.logger.info(f"cluster is up. final status: {status['state']}")
        return cluster_id

    def install_cluster_library(self, cluster_id, cluster_libraries):
        """function to install libraries on cluster

        Installs every requested library that the cluster does not report
        yet and uninstalls every reported library that is not requested.
        Errors are logged, not raised.

        :param cluster_id: id of cluster in Databricks to install libs on
        :type cluster_id: str
        :param cluster_libraries: clusterlib.yaml
        :type cluster_libraries: list(dict)
        """
        try:
            if not isinstance(cluster_libraries, list):
                raise ValueError(
                    f"cluster_libraries is not a list: {cluster_libraries}")

            current_libs = self.libraries_client.cluster_status(cluster_id)

            # parse the libs to match the yaml
            parsed_currentlibs = [
                lib["library"]
                for lib in current_libs.get("library_statuses") or []
            ]

            install_libs = [
                x for x in cluster_libraries if x not in parsed_currentlibs
            ]
            self.logger.info(f"install libraries: {install_libs}")
            # Skip the request when there is nothing to install, so an
            # empty request can never prevent the uninstall step below.
            if install_libs:
                self.libraries_client.install_libraries(
                    cluster_id, install_libs)

            uninstall_libs = [
                x for x in parsed_currentlibs if x not in cluster_libraries
            ]
            self.logger.warning(f"uninstall libraries: {uninstall_libs}")
            if uninstall_libs:
                self.libraries_client.uninstall_libraries(
                    cluster_id, uninstall_libs)
        except Exception as error:
            self.logger.error(f"install_cluster_library error: {repr(error)}")

    def _cluster_status(self, cluster_id):
        """internal method to get cluster status

        :param cluster_id: id of databricks cluster
        :type cluster_id: str
        :return: the cluster description, or None when the lookup raised
                 (the error is logged)
        """
        try:
            return self.cluster_client.get_cluster(cluster_id)
        except Exception as error:
            self.logger.error(f"cluster status error: {error}")
            return None

    def delete_unmanaged_clusters(self, cluster_config):
        """function to delete clusters that are not in clusterconf.yaml

        Clusters whose ``cluster_source`` is ``JOB`` are never deleted.

        :param cluster_config: clusterconf.yaml
        :type cluster_config: list(dict)
        """
        response = self.cluster_client.list_clusters()
        # A missing or empty "clusters" entry means there is nothing to
        # consider; never iterate the raw response dict itself.
        existing_clusters = [
            c for c in response.get("clusters") or []
            if c["cluster_source"].upper() != "JOB"
        ]
        self.logger.debug(existing_clusters)

        managed_names = {c["cluster_name"] for c in cluster_config}
        remove_cluster = [(c["cluster_name"], c["cluster_id"])
                          for c in existing_clusters
                          if c["cluster_name"] not in managed_names]

        self.logger.warning("removing unmanaged clusters:")
        self.logger.warning(remove_cluster)

        for _name, unmanaged_id in remove_cluster:
            self.logger.debug(f"deleting {unmanaged_id}")
            self.cluster_client.permanent_delete(unmanaged_id)
        return

    def main(self, cluster_specs, cluster_libraries):
        """main method to build/edit clusters and install libs

        :param cluster_specs: cluster spec in clusterconf.yaml
        :type cluster_specs: dict
        :param cluster_libraries: clusterlib.yaml
        :type cluster_libraries: list(dict)
        """
        self.logger.info(
            f"create/update cluster: {cluster_specs['cluster_name']}")
        cluster_id = self.create_cluster(cluster_specs)

        self.logger.info("installing libraries")
        self.install_cluster_library(cluster_id, cluster_libraries)
status["state"]))) sys.exit() # install the library and its dependencies print("Installing the reco_utils module onto databricks cluster {}".format( args.cluster_id)) libs2install = [{"egg": upload_path}] # PYPI dependencies: libs2install.extend([{"pypi": {"package": i}} for i in PYPI_RECO_LIB_DEPS]) # add mmlspark if selected. if args.mmlspark: print("Installing MMLSPARK package...") libs2install.extend([MMLSPARK_INFO]) print(libs2install) LibrariesApi(my_api_client).install_libraries(args.cluster_id, libs2install) # prepare for operationalization if desired: if args.prepare_o16n: prepare_for_operationalization( cluster_id=args.cluster_id, api_client=my_api_client, dbfs_path=args.dbfs_path, overwrite=args.overwrite, spark_version=status["spark_version"][0], ) # restart the cluster for new installation(s) to take effect. print("Restarting databricks cluster {}".format(args.cluster_id)) ClusterApi(my_api_client).restart_cluster(args.cluster_id)