def test_pat_token():
    """Mint a short-lived PAT through AAD auth, then authenticate with it."""
    aad_client = databricks_client.create(dbricks_api)
    aad_client.auth_azuread(workspace_resource_id=resource_id)
    token_response = aad_client.post(
        'token/create',
        json={"lifetime_seconds": 60, "comment": "Unit Test Token"}
    )
    # A second client authenticated only with the freshly minted PAT
    # must be able to reach the workspace API.
    pat_client = databricks_client.create(aad_client.host)
    pat_client.auth_pat_token(token_response['token_value'])
    get_clusters_list(pat_client)
def test_credentials(mocker):
    """Supplying an explicit token callback must bypass the CLI credentials."""
    cli_cred_spy = mocker.patch(
        "azure.common.credentials.get_azure_cli_credentials")
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(resource_id, lambda r: credentials.get_token(r).token)
    get_clusters_list(client)
    # The Azure CLI credential path must never have been exercised.
    cli_cred_spy.assert_not_called()
def test_throws_exception_on_non_json_response():
    """A bogus workspace resource id yields a DatabricksPayloadException."""
    client = databricks_client.create(dbricks_api)
    bogus_resource_id = (
        '/subscriptions/%s/resourceGroups/%s/providers/'
        'Microsoft.Databricks/workspaces/%s'
        % (subscription_id, resource_group, 'dummydummy'))
    client.auth_azuread(workspace_resource_id=bogus_resource_id)
    with pytest.raises(databricks_client.DatabricksPayloadException):
        get_clusters_list(client)
def test_query_clusters_list():
    """client.query with requests.get returns the clusters payload."""
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(workspace_resource_id=resource_id)
    listing = client.query(requests.get, 'clusters/list')
    assert "clusters" in listing
def test_credentials_msrest(mocker):
    """AAD auth via an ADAL client-credentials token callback."""
    client = databricks_client.create(dbricks_api)

    def fetch_token(resource):
        # Service-principal token for whatever resource the client asks for.
        token = context.acquire_token_with_client_credentials(
            resource, client_id, client_secret)
        return token["accessToken"]

    client.auth_azuread(resource_id, fetch_token)
    get_clusters_list(client)
def test_ensure_available_when_provisioned():
    """ensure_available returns quietly when the workspace answers 200."""
    with requests_mock.Mocker() as mock_http:
        mock_http.register_uri(
            'GET', 'mock://test.com/instance-pools/list',
            [{'json': {}, 'status_code': 200}],
        )
        client = databricks_client.create('mock://test.com')
        client.ensure_available()
def test_ensure_available_when_timingout():
    """ensure_available raises after exhausting retries on a 400 response."""
    with requests_mock.Mocker() as m:
        m.register_uri('GET', 'mock://test.com/xyz', [
            {'json': non_provisioned_response, 'status_code': 400},
        ])
        client = databricks_client.create('mock://test.com')
        # pytest.raises replaces the try/pytest.fail/except dance and matches
        # the exception-test style used elsewhere in this file.
        with pytest.raises(databricks_client.DatabricksNotAvailableException):
            client.ensure_available(url="xyz", retries=2, delay_seconds=0.01)
def test_workspace_name():
    """AAD auth resolving the workspace by resource group + workspace name."""
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(
        resource_group=resource_group, workspace_name=workspace_name)
    get_clusters_list(client)
def test_rethrows_http_error():
    """An invalid PAT token surfaces as requests' HTTPError, not swallowed."""
    client = databricks_client.create(dbricks_api)
    client.auth_pat_token("dapi0000000")
    with pytest.raises(requests.exceptions.HTTPError):
        get_clusters_list(client)
def test_get_clusters_list():
    """Smoke test: AAD auth against the workspace, then list clusters."""
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(workspace_resource_id=resource_id)
    get_clusters_list(client)
def main():
    """Builds the Azure ML pipeline for data engineering and model training.

    Reads its configuration from environment variables, provisions/attaches
    a Databricks compute and instance pool, registers the training datastore,
    and publishes a feature-engineering pipeline whose id is exported as an
    Azure DevOps variable.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    # BUG FIX: the default must be the *string* '0' — build_id is concatenated
    # into the token comment below and passed as the pipeline version; the
    # previous int default (0) raised TypeError on "... Build " + build_id.
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"
    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        # No compute attached yet: mint a PAT and attach the workspace.
        pat_token = dbricks_client.post('token/create', json={
            "comment": "Azure ML Token generated by Build " + build_id
        })['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )
    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool (idempotent: reuse if it exists)
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create', json={
            "instance_pool_name": pool_name,
            "node_type_id": "Standard_D3_v2",
            "idle_instance_autotermination_minutes": 10,
            "preloaded_spark_versions": [DATABRICKS_RUNTIME_VERSION],
        })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step
    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")
    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)
    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")
    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    # feature_eng_output as input.
    # ...

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]
    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")