Example #1
def scrape_workspace(workspace, session, instance_types):
    log.info(f"Scraping workspace {workspace.name}, {workspace.url}")
    result = ScraperRun.empty()
    result.start()
    session.merge(workspace)
    result.num_workspaces += 1

    api = DatabricksAPI(host=workspace.url, token=workspace.token)

    # CLUSTERS
    log.info(f"Started scraping clusters in workspace {workspace.name}.")
    clusters = query_paginated(api.cluster.list_clusters, {}, 'clusters')
    for cluster in clusters:
        scrape_cluster(workspace, cluster, instance_types, session, api,
                       result)
    log.info(f"Finished scraping clusters in workspace {workspace.name}.")

    # JOBS
    log.info(f"Started scraping jobs in workspace {workspace.name}.")
    jobs = query_paginated(api.jobs.list_jobs, {}, 'jobs')
    for job in jobs:
        scrape_jobs(workspace, job, session, api, result)
    log.info(f"Finished scraping jobs in workspace {workspace.name}. "
             f"Jobs scraped: {len(jobs)}")

    # USERS
    log.info(f"Started scraping users in workspace {workspace.name}.")
    scrape_users(workspace, session, result)
    log.info(f"Finished scraping users in workspace {workspace.name}. "
             f"Users scraped: {result.num_users}")

    result.finish(ScraperRun.SUCCESSFUL)
    return result
Example #2
def upload_artifacts(workspace_url: str, oauth_access_token: str,
                     local_artifacts_path: str, dbfs_dir_path: str):
    adb_client = DatabricksAPI(host=workspace_url, token=oauth_access_token)
    files_to_upload = []
    if isdir(local_artifacts_path):
        files_to_upload = [
            join(local_artifacts_path, f)
            for f in listdir(local_artifacts_path)
            if isfile(join(local_artifacts_path, f))
        ]
    else:
        files_to_upload = [local_artifacts_path]
    dbfs_folder_exists = False
    try:
        status_rsp = adb_client.dbfs.get_status(path=dbfs_dir_path)
        dbfs_folder_exists = status_rsp["is_dir"]
    except HTTPError as er:
        if er.response.status_code == 404:
            dbfs_folder_exists = False
        else:
            raise

    if not dbfs_folder_exists:
        print("Creating destination directory on DBFS %s " % dbfs_dir_path)
        adb_client.dbfs.mkdirs(path=dbfs_dir_path)

    for local_file in files_to_upload:
        print("Uploading %s ..." % local_file)
        _upload_multipart(adb_client, local_file, dbfs_dir_path)
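The _upload_multipart helper is not shown in this example. A minimal sketch of what it might look like, reusing the dbfs create / add_block / close flow demonstrated in the next example (the helper name, its signature, and the overwrite behavior are assumptions):

import base64
from os.path import basename

def _upload_multipart(adb_client, local_file, dbfs_dir_path):
    # Hypothetical sketch: stream the local file to DBFS in base64-encoded blocks.
    dbfs_path = dbfs_dir_path.rstrip("/") + "/" + basename(local_file)
    handle = adb_client.dbfs.create(path=dbfs_path, overwrite=True)['handle']
    with open(local_file, "rb") as f:
        while True:
            block = f.read(1 << 20)  # add_block accepts at most 1 MB per call
            if not block:
                break
            adb_client.dbfs.add_block(
                handle=handle,
                data=base64.standard_b64encode(block).decode("utf-8"))
    adb_client.dbfs.close(handle=handle)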
Example #3
def upload_mount_storage_file(databricks_host: str, token: str):
    adb_client = DatabricksAPI(host=databricks_host, token=token)

    adb_client.dbfs.mkdirs(path="/mnt/provision")
    res = adb_client.dbfs.list("/mnt/provision")
    print("Successfully created the provision folder: /mnt/provision")

    try:
        # remove any previous copy of the script; ignore the error if it does not exist
        adb_client.dbfs.delete("/mnt/provision/mount_dbfs.py")
    except Exception:
        pass

    print("Preparing to upload mount_dbfs.py to: /mnt/provision/mount_dbfs.py")
    handle = adb_client.dbfs.create(
        path="/mnt/provision/mount_dbfs.py")['handle']
    # TODO: get the current path
    config_dir = pathlib.Path(__file__).parent.absolute()
    local_file = os.path.join(config_dir, "mount_dbfs.py")
    print("Path for mount_dbfs.py is: ", local_file)

    with open(local_file, "rb") as f:
        while True:
            # A block can be at most 1MB
            block = f.read(1 << 20)
            if not block:
                break
            data = base64.standard_b64encode(block)
            adb_client.dbfs.add_block(handle=handle, data=data.decode("utf-8"))
    # close the handle to finish uploading
    adb_client.dbfs.close(handle=handle)
    print("Upload succeeded: ", local_file)
Example #4
def test__execute__raises_403_http_exception__no_retries_and_raises(mocker):
    retrier = HTTPRetrier(2, 1)

    db = DatabricksAPI(host='HOST', token='TOKEN')
    mock_request = mocker.patch.object(db.client.session, 'request')
    mock_resp = requests.models.Response()
    mock_resp.status_code = 403
    mock_request.return_value = mock_resp

    with pytest.raises(HTTPError):
        return_value = retrier.execute(db.jobs.get_run_output, 1)
    assert retrier._tries == 0
Example #5
    def __init__(self):
        config = cfg.get_auth_config()
        self.min_timeout = MIN_TIMEOUT

        if config is None:
            raise InvalidConfigurationException

        # TODO: remove the dependency on this API, and instead use httpclient/requests
        db = DatabricksAPI(host=config.host, token=config.token)
        self.inner_dbclient = db

        # The retrier uses the recommended defaults
        # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs
        self._retrier = HTTPRetrier()
Example #6
def test__execute__raises_invalid_state_http_exception__retries_twice_and_raises(
        mocker):
    retrier = HTTPRetrier(2, 1)

    db = DatabricksAPI(host='HOST', token='TOKEN')
    mock_request = mocker.patch.object(db.client.session, 'request')
    response_body = " { 'error_code': 'INVALID_STATE', 'message': 'Run result is empty. " + \
                    " There may have been issues while saving or reading results.'} "

    mock_resp = requests.models.Response()
    mock_resp.status_code = 400
    mock_resp.raw = io.BytesIO(bytes(response_body, 'utf-8'))
    mock_request.return_value = mock_resp

    with pytest.raises(HTTPError):
        return_value = retrier.execute(db.jobs.get_run_output, 1)
    assert retrier._tries == 2
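HTTPRetrier belongs to the surrounding project rather than to databricks-api itself. A minimal sketch consistent with the two tests above, in which only INVALID_STATE responses are retried; the constructor arguments, default values, and retry condition are assumptions:

import time
from requests.exceptions import HTTPError

class HTTPRetrier:
    def __init__(self, max_retries=3, wait_seconds=10):
        # Defaults here are placeholders; the tests construct HTTPRetrier(2, 1).
        self._max_retries = max_retries
        self._wait_seconds = wait_seconds
        self._tries = 0

    def execute(self, fn, *args, **kwargs):
        self._tries = 0
        while True:
            try:
                return fn(*args, **kwargs)
            except HTTPError as e:
                # Non-retriable errors (e.g. a 403) are re-raised immediately.
                retriable = e.response is not None and "INVALID_STATE" in (e.response.text or "")
                if not retriable or self._tries >= self._max_retries:
                    raise
                self._tries += 1
                time.sleep(self._wait_seconds)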
Example #7
import inspect

from databricks_api import DatabricksAPI
import databricks_cli


db = DatabricksAPI(host="localhost", token="token")


intro = """databricks-api
==============

|pypi| |pyversions|

.. |pypi| image:: https://img.shields.io/pypi/v/databricks-api.svg
    :target: https://pypi.python.org/pypi/databricks-api

.. |pyversions| image:: https://img.shields.io/pypi/pyversions/databricks-api.svg
    :target: https://pypi.python.org/pypi/databricks-api

*[This documentation is auto-generated]*

This package provides a simplified interface for the Databricks REST API.
The interface is autogenerated on instantiation using the underlying client
library used in the official ``databricks-cli`` python package.

Install using

.. code-block:: bash

    pip install databricks-api
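The pattern used throughout these examples is the same: instantiate DatabricksAPI with a host and token, then call the service attributes that the wrapper generates from the databricks-cli client (for example db.cluster, db.jobs, db.dbfs, db.workspace, db.secret and db.token). A minimal sketch with placeholder credentials:

from databricks_api import DatabricksAPI

# Placeholder host and token; in practice these come from configuration or secrets.
db = DatabricksAPI(host="example.cloud.databricks.com", token="dapiXXXXXXXX")

# List clusters through the generated ClusterService wrapper.
clusters = db.cluster.list_clusters().get("clusters", [])
for cluster in clusters:
    print(cluster["cluster_id"], cluster["cluster_name"])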
Example #8
 def __init__(self, host, token, workspace_id=None):
     self.host = host
     self.workspace_id = workspace_id
     self.client = DatabricksAPI(host=host, token=token)
Example #9
import sys
import time
from databricks_api import DatabricksAPI
from datetime import datetime, timedelta

print(sys.argv)
token = sys.argv[1]

# Provide a host and token
db = DatabricksAPI(host="eastus.azuredatabricks.net", token=token)

job = db.jobs.run_now(job_id=1)
Example #10
 def jobRunner(self):
     client = DatabricksAPI(host=self.dbParams['instance'],
                            token=self.dbParams['token'])
     resp = client.jobs.run_now(job_id=self.dbParams['job_id'])
     return resp
Example #11
import sys
import base64
from databricks_api import DatabricksAPI
import re

prod_token = sys.argv[1]
prod_host = sys.argv[2]
notebook_name = sys.argv[3]

db = DatabricksAPI(host=prod_host, token=prod_token)


def import_notebook(file_data, deployment_reference, notebook_full_name):
    # adding disclaimer
    disclaimer = (
        "# Databricks notebook source\n# MAGIC %md\n# MAGIC\n# MAGIC # {0}\n# MAGIC\n"
        "# MAGIC > Deployed version as {1}\n# MAGIC\n"
        "# MAGIC <em>Please only edit using proper [git flow](https://dev.azure.com/Teck/_git/RACE21%20-%20Trail) "
        "as this document will be overwritten during the next deployment. "
        "Checkout [dev branch](https://dev.azure.com/Teck/_git/RACE21%20-%20Trail?path=%2F&version=GBdev&_a=contents).</em>\n"
        "\n# COMMAND ----------\n"
    ).format(deployment_reference, notebook_full_name)
    databricks_note = "# Databricks notebook source\n"
    file_data = file_data.replace(databricks_note, disclaimer)
    # encoding for databricks import
    encodedBytes = base64.b64encode(file_data.encode("utf-8"))
    encodedStr = str(encodedBytes, "utf-8")

    db.workspace.import_workspace(notebook_full_name,
                                  format="SOURCE",
                                  language="PYTHON",
                                  content=encodedStr,
                                  overwrite="true")
    print("{} deployed!".format(notebook_full_name))


with open("{}".format(notebook_name)) as file:
Example #12
def provision_databricks_cluster(install_config: InstallConfiguration,
                                 workspace_url: str, oauth_access_token: str,
                                 gdc_sp_secret_value: str,
                                 managed_libraries: list = None,
                                 gdc_sp_secret_name: str = "gdc-service-principal-secret",
                                 gdc_graph_api_sp_secret_name: str = "graph-api-service-principal-secret",
                                 secret_scope_name: str = "gdc",
                                 adb_cluster_name: str = "default-gdc-cluster",
                                 max_worker: int = 2,
                                 node_type_id: str = "Standard_DS3_v2",
                                 autotermination_minutes: int = 60):
    """

    :param managed_libraries: list of json object in format https://docs.databricks.com/dev-tools/api/latest/libraries.html#example-request
    :param workspace_url:
    :param oauth_access_token:
    :param gdc_sp_secret_value:
    :param gdc_sp_secret_name:
    :param gdc_graph_api_sp_secret_name:
    :param secret_scope_name:
    :param adb_cluster_name:
    :param max_worker:
    :param node_type_id:
    :param autotermination_minutes:
    :return:  dict {
        "cluster_id": cluster_id,
        "api_token": adb_api_token
    }
    """
    print("Provisioning ADB cluster ...")
    assert oauth_access_token is not None
    adb_client = DatabricksAPI(host=workspace_url, token=oauth_access_token)
    scopes = adb_client.secret.list_scopes().get("scopes", [])
    if not any(x for x in scopes if x.get("name") == secret_scope_name):
        adb_client.secret.create_scope(scope=secret_scope_name,
                                       initial_manage_principal="users")

    adb_client.secret.put_secret(scope=secret_scope_name, key=gdc_sp_secret_name, string_value=gdc_sp_secret_value)
    # both Databricks jobs use the gdc-service service principal to access the Graph API and other components,
    # but we've introduced two secrets for flexibility even though they have the same value for now
    adb_client.secret.put_secret(scope=secret_scope_name, key=gdc_graph_api_sp_secret_name,
                                 string_value=gdc_sp_secret_value)

    adb_api_token = adb_client.token.create_token(comment="GDC Pipeline API token")
    cluster_id = None
    clusters = adb_client.cluster.list_clusters().get("clusters", [])
    cluster_rsp = list([x for x in clusters if x.get("cluster_name") == adb_cluster_name])
    if not cluster_rsp:
        print("Creating a new cluster %s" % adb_cluster_name)
        cluster_rsp = adb_client.cluster.create_cluster(
            cluster_name=adb_cluster_name,
            autoscale={
                "min_workers": 1,
                "max_workers": max_worker
            },
            node_type_id=node_type_id,
            driver_node_type_id=node_type_id,
            autotermination_minutes=autotermination_minutes,
            enable_elastic_disk=True,
            spark_version="6.6.x-scala2.11")
    else:
        print("Cluster %s exists at %s" % (adb_cluster_name, workspace_url))
        cluster_rsp = cluster_rsp[0]

    # capture cluster details as soon as it's available
    install_config.adb_cluster_details = {
        "cluster_id": cluster_rsp['cluster_id'],
        "api_token": adb_api_token
    }
    cluster_id = cluster_rsp['cluster_id']
    if managed_libraries:
        cluster_info = adb_client.cluster.get_cluster(cluster_id=cluster_id)
        cluster_state = cluster_info['state']
        # possible values PENDING, TERMINATED and RUNNING
        if cluster_state == "TERMINATED":
            print("Starting cluster %s " % cluster_id)
            adb_client.cluster.start_cluster(cluster_id=cluster_id)

        cluster_state = "PENDING"
        while cluster_state in ("PENDING", "RESTARTING", "RESIZING"):
            print("Waiting cluster %s " % cluster_id)
            sleep(5)
            cluster_info = adb_client.cluster.get_cluster(cluster_id=cluster_id)
            cluster_state = cluster_info['state']
            print("Cluster is now in state %s " % cluster_state)

        if cluster_state in ("TERMINATING", "TERMINATED", "ERROR"):
            print("Can't install managed libraries, cluster %s is not running" % cluster_id)
            raise RuntimeError("Can't install managed libraries, cluster %s is not running. "
                               "Check Databricks Workspace Portal for details and try again later" % cluster_id)
        else:
            try:
                print("Installing managed libraries on cluster %s " % cluster_id)
                install_managed_libraries(adb_client, cluster_id, managed_libraries)
            except BaseException as e:
                print("Failed to install libraries into cluster %s " % cluster_id)
                print(e)

    return {
        "cluster_id": cluster_id,
        "api_token": adb_api_token
    }
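For reference, a call to provision_databricks_cluster might look like the sketch below; every value is a placeholder, install_config is assumed to be an InstallConfiguration from the surrounding project, and the managed_libraries entry follows the Libraries API request format referenced in the docstring:

# Hypothetical invocation with placeholder values.
cluster_details = provision_databricks_cluster(
    install_config=install_config,
    workspace_url="https://adb-1234567890123456.7.azuredatabricks.net",
    oauth_access_token=oauth_access_token,
    gdc_sp_secret_value=gdc_sp_secret_value,
    managed_libraries=[{"pypi": {"package": "requests"}}],
    max_worker=4)
print(cluster_details["cluster_id"])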
Example #13
 def getDbxApi(self) -> DatabricksAPI:
     return DatabricksAPI(host=self.__config.dbx.host,
                          user=self.__config.dbx.user,
                          token=self.__config.dbx.token)
Example #14
def execute_script_mount_storage_script(databricks_host: str, token: str,
                                        cluster_id: str,
                                        storage_account_name: str,
                                        container_name: str, secret_key: str):

    adb_client = DatabricksAPI(host=databricks_host, token=token)
    res = adb_client.dbfs.list("/mnt/")
    print("Waiting 30 seconds before proceeding with the deployment")
    time.sleep(30)

    deployment_successful = False
    for i in range(0, 3):
        submit_run_res = adb_client.jobs.submit_run(
            run_name="mount_storage",
            existing_cluster_id=cluster_id,
            spark_python_task={
                "python_file": "dbfs:/mnt/provision/mount_dbfs.py",
                "parameters": [
                    f"--account_name####{storage_account_name}",
                    f"--container_name####{container_name}",
                    f"--secret_key_name####{secret_key}",
                    "--mount_point####/mnt/watercooler"
                ]
            },
            timeout_seconds=3600)
        run_id = submit_run_res["run_id"]

        while True:
            res = adb_client.jobs.get_run(run_id=run_id)
            if res is None:
                print("Cluster mount job completed")
                deployment_successful = True
                break
            if "state" in res:
                print("Cluster mount job status is: " + str(res["state"]))
            life_cycle_state = res.get("state", {}).get("life_cycle_state")
            if life_cycle_state in ["PENDING", "RUNNING"]:
                time.sleep(5)
                continue
            if life_cycle_state in ["INTERNAL_ERROR", "FAILED", "TIMED_OUT"]:
                deployment_successful = False
                break
            if life_cycle_state in ["SUCCESSFUL", "TERMINATED"]:
                deployment_successful = True
                break
            time.sleep(5)
        if deployment_successful:
            break
        else:
            print("Retrying: attempt %d of 3. Waiting 10 seconds" % (i + 1))
            time.sleep(10)

        time.sleep(30)
    print("Cluster mount job completed successfully:",
          str(deployment_successful))
Example #15
 def __init__(self):
     self.db = DatabricksAPI(
         host=settings.DATABRICKS_HOST,
         token=settings.DATABRICKS_TOKEN)
Example #16
    def getSecretToken(spark, dbUserId):
        # get the clusterID and host db-connect is configured for
        clusterId = spark.conf.get("spark.databricks.service.clusterId")
        # service.address has the org in it too, which the API doesn't like
        dbHost = spark.conf.get("spark.databricks.service.address").split(
            "?")[0]
        pat = spark.conf.get('spark.databricks.service.token')

        # get our client for running API requests
        db = DatabricksAPI(host=dbHost, token=pat)

        # upload the notebook code required to fetch the token to our workspace
        # create a "tmp" folder under our user folder
        # import the notebook there
        notebookDir = "/Users/{}/tmp".format(dbUserId)
        notebookPath = notebookDir + "/fetch_apiToken"

        try:
            # Create a tmp folder under the user dir in workspace, if not exists
            db.workspace.mkdirs(notebookDir)

            # Import a one line python notebook to get and return the apiToken
            notebookData = b'# Databricks notebook source\n' + \
                           b'dbutils.notebook.exit' + \
                           b'((".".join(list(dbutils.notebook.entry_point.getDbutils()' + \
                           b'.notebook().getContext().apiToken().get()))))'

            encodedNotebookData = base64.b64encode(notebookData)
            encodedNotebookDataStr = encodedNotebookData.decode("utf-8")

            db.workspace.import_workspace(content=encodedNotebookDataStr,
                                          path=notebookPath,
                                          language='PYTHON',
                                          overwrite=True,
                                          format='SOURCE')

            # Submit a "runs/submit" call for the fetch_apiToken notebook
            notebookTask = {"notebook_path": notebookPath}

            submitResponse = db.jobs.submit_run(
                run_name='fetch_apiToken',
                existing_cluster_id=clusterId,
                notebook_task=notebookTask,
                timeout_seconds=120,
            )

            # Wait for the runs/submit job to finish, so we can get the result
            runId = submitResponse['run_id']

            jobRunning = True
            jobOutput = None

            # check for results every 1 second until it's done
            while jobRunning:
                time.sleep(1)
                output = db.jobs.get_run_output(runId)
                state = output['metadata']['state']['life_cycle_state']
                # print(state)
                if state not in ['RUNNING', 'PENDING', 'TERMINATING']:
                    jobRunning = False
                    result_state = output['metadata']['state']['result_state']
                    if result_state == 'SUCCESS':
                        jobOutput = output['notebook_output']['result']
                    else:
                        jobOutput = "FAILED"

            # return our token
            # it comes with . between each char, remove them
            return jobOutput.replace('.', '')

        except Exception as e:
            print(
                "Exception while importing and executing tmp/fetch_apiToken notebook"
            )
            print(e)
Example #17
        result = dbricks_client.get_run(db=DATABRICKS_API, run_id=run_id)
        return {'result': result}

    @runs_ns.doc('delete run')
    # @runs_ns.response(204, 'run deleted')
    def delete(self, run_id):
        '''Delete a run given its identifier'''
        print(run_id)
        cancel_run_result, delete_run_result = dbricks_client.cancel_and_delete_run(
            db=DATABRICKS_API, run_id=run_id)
        return {'run_id': run_id, 'result': delete_run_result}


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    JOB_CONFIG_PATH = os.path.join(os.path.dirname(__file__), "config",
                                   "job.config.json")

    # Provide a host and token for connecting to DataBricks
    DATABRICKS_HOST = os.getenv("DATABRICKS_HOST")
    DATABRICKS_TOKEN = os.getenv("DATABRICKS_TOKEN")

    PYPI_INDEX_URL = os.getenv("PYPI_INDEX_URL")

    DATABRICKS_API = DatabricksAPI(host=DATABRICKS_HOST,
                                   token=DATABRICKS_TOKEN)

    app.run(host='0.0.0.0')
Example #18
 def __init__(self, host, token):
     self.client = DatabricksAPI(host=host, token=token)
Example #19
def get_db():
    with open("../config.json", "r") as config:
        return DatabricksAPI(**json.load(config))
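For this helper to work, ../config.json has to contain keyword arguments accepted by DatabricksAPI, for example {"host": "...", "token": "..."}, matching the constructor calls used in the examples above.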