Example #1
def test_pat_token():
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(workspace_resource_id=resource_id)
    response = client.post(
        'token/create',
        json={"lifetime_seconds": 60, "comment": "Unit Test Token"}
    )
    pat_client = databricks_client.create(client.host)
    pat_client.auth_pat_token(response['token_value'])
    get_clusters_list(pat_client)
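
Several of the tests in this listing call a shared get_clusters_list helper that is not shown here. A minimal sketch, assuming it simply queries the clusters/list endpoint the same way Example #4 does and sanity-checks the response:

import requests

def get_clusters_list(client):
    # Hypothetical helper (an assumption, not part of the original listing):
    # query the Databricks clusters/list endpoint and check that the
    # response parsed as JSON before returning it to the caller.
    response = client.query(requests.get, 'clusters/list')
    assert isinstance(response, dict)
    return response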
Example #2
def test_credentials(mocker):
    get_cred = mocker.patch(
        "azure.common.credentials.get_azure_cli_credentials")
    client = databricks_client.create(dbricks_api)

    client.auth_azuread(resource_id, lambda r: credentials.get_token(r).token)
    get_clusters_list(client)
    get_cred.assert_not_called()
Example #3
def test_throws_exception_on_non_json_response():
    client = databricks_client.create(dbricks_api)
    resource_id = ('/subscriptions/%s/resourceGroups/%s/providers/'
                   'Microsoft.Databricks/workspaces/%s' %
                   (subscription_id, resource_group, 'dummydummy'))
    client.auth_azuread(workspace_resource_id=resource_id)
    with pytest.raises(databricks_client.DatabricksPayloadException):
        get_clusters_list(client)
Example #4
def test_query_clusters_list():
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(workspace_resource_id=resource_id)
    response = client.query(
        requests.get,
        'clusters/list',
    )
    assert "clusters" in response
Example #5
def test_credentials_msrest(mocker):
    client = databricks_client.create(dbricks_api)

    def token_callback(resource):
        return context.acquire_token_with_client_credentials(
            resource, client_id, client_secret)["accessToken"]
    client.auth_azuread(resource_id, token_callback)
    get_clusters_list(client)
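
The context, client_id and client_secret names used by token_callback are module-level values that are not part of this listing. A minimal sketch of how such an ADAL authentication context could be set up, with placeholder tenant and service-principal values (these are assumptions, not values from the original tests):

import adal

tenant_id = "<tenant-id>"                      # placeholder
client_id = "<service-principal-app-id>"       # placeholder
client_secret = "<service-principal-secret>"   # placeholder

# Azure AD authentication context; token_callback in Example #5 asks it
# for an access token scoped to the resource it receives.
context = adal.AuthenticationContext(
    "https://login.microsoftonline.com/" + tenant_id)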
Example #6
def test_ensure_available_when_provisioned():
    with requests_mock.Mocker() as m:
        m.register_uri('GET', 'mock://test.com/instance-pools/list', [
            {
                'json': {},
                'status_code': 200
            },
        ])
        client = databricks_client.create('mock://test.com')
        client.ensure_available()
Example #7
def test_ensure_available_when_timingout():
    with requests_mock.Mocker() as m:
        m.register_uri('GET', 'mock://test.com/xyz', [
            {
                'json': non_provisioned_response,
                'status_code': 400
            },
        ])
        client = databricks_client.create('mock://test.com')
        with pytest.raises(databricks_client.DatabricksNotAvailableException):
            client.ensure_available(url="xyz", retries=2, delay_seconds=0.01)
Example #8
def test_workspace_name():
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(resource_group=resource_group,
                        workspace_name=workspace_name)
    get_clusters_list(client)
Example #9
def test_rethrows_http_error():
    client = databricks_client.create(dbricks_api)
    client.auth_pat_token("dapi0000000")
    with pytest.raises(requests.exceptions.HTTPError):
        get_clusters_list(client)
Example #10
def test_get_clusters_list():
    client = databricks_client.create(dbricks_api)
    client.auth_azuread(workspace_resource_id=resource_id)
    get_clusters_list(client)
Example #11
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"

    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id},
        )['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )

    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create', json={
            "instance_pool_name": pool_name,
            "node_type_id": "Standard_D3_v2",
            "idle_instance_autotermination_minutes": 10,
            "preloaded_spark_versions": [DATABRICKS_RUNTIME_VERSION],
        })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step

    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")

    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)

    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")

    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    #   feature_eng_output as input.
    # ...

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")