Example 1
    def _get_data_references(self, request_id, internal_datastore):
        print(
            'AMLCompute, _get_data_references() called. Request ID: {}'.format(
                request_id))
        # The datastore name may only contain alphanumeric characters and underscores.
        request_id_to_use_for_datastore = request_id.replace('-', '_')
        try:
            # setting the overwrite flag to True overwrites any datastore that was created previously with that name

            # internal_datastore stores all user-facing files: list of images, detection results, list of failed images
            # and it so happens that each job also needs the list of images as an input
            internal_datastore_name = 'internal_datastore_{}'.format(
                request_id_to_use_for_datastore)
            internal_account_name = internal_datastore['account_name']
            internal_account_key = internal_datastore['account_key']
            internal_container_name = internal_datastore['container_name']
            internal_datastore = Datastore.register_azure_blob_container(
                self.ws,
                internal_datastore_name,
                internal_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('internal_datastore done')

            # output_datastore stores the output from score.py in each job, which is another container
            # in the same storage account as internal_datastore
            output_datastore_name = 'output_datastore_{}'.format(
                request_id_to_use_for_datastore)
            output_container_name = api_config.AML_CONTAINER
            output_datastore = Datastore.register_azure_blob_container(
                self.ws,
                output_datastore_name,
                output_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('output_datastore done')

        except Exception as e:
            raise RuntimeError(
                'Error in connecting to the datastores for AML Compute: {}'.
                format(str(e)))

        try:
            internal_dir = DataReference(datastore=internal_datastore,
                                         data_reference_name='internal_dir',
                                         mode='mount')

            output_dir = PipelineData(
                'output_{}'.format(request_id_to_use_for_datastore),
                datastore=output_datastore,
                output_mode='mount')
            print('Finished setting up the Data References.')
        except Exception as e:
            raise RuntimeError(
                'Error in creating data references for AML Compute: {}.'.
                format(str(e)))

        return internal_dir, output_dir
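The method above relies on the standard Azure ML SDK classes; as a sketch, the enclosing module would need imports along these lines (the same import paths appear in Examples 11 and 14 below):

from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import PipelineData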
Example 2
def get_blob_datastore(workspace: Workspace, data_store_name: str,
                       storage_name: str, storage_key: str,
                       container_name: str):
    """
    Returns a reference to a datastore
    Parameters:
      workspace (Workspace): existing AzureML Workspace object
      data_store_name (string): data store name
      storage_name (string): blob storage account name
      storage_key (string): blob storage account key
      container_name (string): container name
    Returns:
        Datastore: a reference to datastore
    """
    try:
        blob_datastore = Datastore.get(workspace, data_store_name)
        print("Found Blob Datastore with name: %s", data_store_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=data_store_name,
            account_name=storage_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=storage_key)  # Storage account key
    print("Registered blob datastore with name: %s", data_store_name)
    return blob_datastore
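A minimal usage sketch, assuming an existing workspace config on disk; the account, key and container values below are placeholders:

from azureml.core import Workspace

ws = Workspace.from_config()
datastore = get_blob_datastore(ws,
                               data_store_name="training_blob",
                               storage_name="mystorageaccount",
                               storage_key="<storage-account-key>",
                               container_name="trainingdata")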
Example 3
def register_data_store(work_space,
                        data_store_name,
                        container_name,
                        blob_account_name,
                        blob_account_key,
                        set_default=False):
    """
    register_data_store - register a blob container as an AML datastore

    :param Workspace work_space: AML workspace
    :param str data_store_name: datastore name
    :param str container_name: blob container name
    :param str blob_account_name: storage account name
    :param str blob_account_key: storage account key
    :param bool set_default: if True, make this the workspace default datastore

    :returns: data_store
    :rtype: azureml.core.Datastore

    """
    data_store = Datastore.register_azure_blob_container(
        workspace=work_space,
        datastore_name=data_store_name,
        container_name=container_name,
        account_name=blob_account_name,
        account_key=blob_account_key,
        create_if_not_exists=True)
    # Set it to default data store for the AML workspace
    if set_default:
        work_space.set_default_datastore(data_store_name)
    return data_store
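A hedged usage sketch: with set_default=True the registration also becomes the workspace default, which can be read back via get_default_datastore() (all values below are placeholders, and ws is assumed to be an existing Workspace):

data_store = register_data_store(work_space=ws,
                                 data_store_name="raw_data",
                                 container_name="rawdata",
                                 blob_account_name="mystorageaccount",
                                 blob_account_key="<storage-account-key>",
                                 set_default=True)
print(ws.get_default_datastore().name)  # expected to print "raw_data"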
Example 4
    def mount_datastores(self, datastore_name, container_name, data_ref_path,
                         data_ref_name=None):
        res_mngr = ResourceManager(self.args.spn_id, self.args.spn_secret, self.args.tenant_id)
        self.account_key = res_mngr.get_storage_account_key(
            self.args.account_name, self.args.subscription_id, self.args.resource_group_name)
        ds = Datastore.register_azure_blob_container(
            self.ws, datastore_name, container_name, self.args.account_name,
            account_key=self.account_key, create_if_not_exists=True)
        base_mount = ds.path(path=data_ref_path, data_reference_name=data_ref_name).as_mount()
        return base_mount
Example 5
def register_dataset(path, aml_interface, storage_acct_name, storage_acct_key):
    workspace = aml_interface.workspace
    # register_azure_blob_container returns the datastore, so a separate Datastore.get is not needed
    prediction_datastore = Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name='prediction',
        container_name='prediction',
        account_name=storage_acct_name,
        account_key=storage_acct_key)

    datastore_path = [(prediction_datastore, path)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    dataset = dataset.register(workspace=workspace,
                               name='Prediction')
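Once registered, the dataset can be fetched by name from any session attached to the same workspace; a short sketch, assuming workspace is the same Workspace object used above and the registration has run:

from azureml.core import Dataset

prediction_ds = Dataset.get_by_name(workspace, name='Prediction')
df = prediction_ds.to_pandas_dataframe()  # materialize the delimited files as a pandas DataFrame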
Example 6
    def config(ws, blob_datastore_name, account_name, container_name,
               account_key):

        try:
            blob_datastore = Datastore.get(ws, blob_datastore_name)
            print("Found Blob Datastore with name: %s" % blob_datastore_name)
        except HttpOperationError:
            blob_datastore = Datastore.register_azure_blob_container(
                workspace=ws,
                datastore_name=blob_datastore_name,
                account_name=account_name,  # Storage account name
                container_name=container_name,  # Name of Azure blob container
                account_key=account_key)  # Storage account key
            print("Registered blob datastore with name: %s" %
                  blob_datastore_name)

        return blob_datastore
Example 7
def createDataReference(workspace, storage_name, storage_key,
                        storage_container_name, data_store_name,
                        data_reference_name):
    '''
        If not present, registers a new azureml.core.datastore.Datastore.
        Once the data store is in hand, it creates an instance of azureml.data.data_reference.DataReference that
        can be used in an Azure ML pipeline step.

        PARAMS: 
            workspace               : azureml.core.Workspace    : Existing AMLS Workspace
            storage_name            : string                    : Name of the Azure Storage Account
            storage_key             : string                    : Access Key to the Azure Storage Account
            storage_container_name  : string                    : Container name to receive blobs. Must exist.
            data_store_name         : string                    : Name of the registered data store.
            data_reference_name     : string                    : Name of the data reference

        RETURNS: 
            tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)

    '''
    data_store = None

    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception as ex:
        print("Creating data store - ", data_store_name)

        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
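A hedged call sketch; the returned DataReference is the object you would pass to a pipeline step's inputs (all names and values below are placeholders):

data_store, raw_data_ref = createDataReference(
    workspace=ws,
    storage_name="mystorageaccount",
    storage_key="<storage-account-key>",
    storage_container_name="rawdata",
    data_store_name="raw_data_store",
    data_reference_name="raw_data")
# e.g. pass inputs=[raw_data_ref] to a pipeline step, as in Example 9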
Example 8
def create_and_attach_blob_storage(cfg, ws):
    """ If required, creates the blob storage containers in the datareferences of cfg """
    if len(cfg.DataReference.localDirectoryBlobList) > 0:
        for ref in cfg.DataReference.localDirectoryBlobList:
            log.info("Attempting to create Blob Container '%s' on storage account '%s'.", ref.remoteBlobContainer, ref.storageAccountName)
            blob_service = BlockBlobService(ref.storageAccountName, ref.storageAccountKey)
            created = blob_service.create_container(ref.remoteBlobContainer, fail_on_exist=False)
            if created:
                log.info("Blob Container '%s' on storage account '%s' created.", ref.remoteBlobContainer, ref.storageAccountName)
            else:
                log.info("Blob Container '%s' on storage account '%s' already existed.", ref.remoteBlobContainer, ref.storageAccountName)
            # Get most recent list of datastores linked to current workspace
            datastores = ws.datastores  # Workspace.datastores is a property (dict of datastore name -> Datastore)
            # Validate if blob_ds is created
            ds = None if ref.dataref_id not in datastores else Datastore(workspace = ws, name = ref.dataref_id)
            # If the datastore exists, check whether it already points at the right account and container
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteBlobContainer:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True
            if recreate:
                log.info('Registering blob "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_blob_container(workspace = ws,
                                                    datastore_name = ref.dataref_id, 
                                                    container_name = ref.remoteBlobContainer, 
                                                    account_name = ref.storageAccountName, 
                                                    account_key = ref.storageAccountKey,
                                                    overwrite = True,  # Overwrites the datastore (not the data itself, the object) if it already is part of this workspace
                                                    )
            else:
                log.info('Blob "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
Example 9
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', '0')  # keep as str; it is concatenated into the token comment below

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"

    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id
                  })['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )

    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create',
                            json={
                                "instance_pool_name":
                                pool_name,
                                "node_type_id":
                                "Standard_D3_v2",
                                "idle_instance_autotermination_minutes":
                                10,
                                "preloaded_spark_versions":
                                [DATABRICKS_RUNTIME_VERSION],
                            })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step

    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")

    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)

    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")

    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    #   feature_eng_output as input.
    # ...

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
Example 10
        "BLOB_ACCOUNTNAME_62",
        "PUT YOUR STORAGE ACCOUNT NAME HERE")  # Storage account name
    container_name = os.getenv(
        "BLOB_CONTAINER_62",
        "PUT YOUR STORAGE CONTAINER NAME HERE")  # Name of Azure blob container
    account_key = os.getenv(
        "BLOB_ACCOUNT_KEY_62",
        "PUT YOUR STORAGE ACCOUNT KEY HERE")  # Storage account key

    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except Exception:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)  # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)

    blob_data_ref = DataReference(datastore=blob_datastore,
                                  data_reference_name="blob_test_data",
                                  path_on_datastore="testdata")
    csv_path = (blob_datastore, '/creditcard.csv')

    try:
        tab_ds = Dataset.Tabular.from_delimited_files(path=csv_path)
        tab_ds = tab_ds.register(workspace=ws, name='creditcard')
    except Exception as ex:
        print(ex)
Example 11
from azureml.core import Workspace
ws = Workspace.from_config()

from azureml.core.datastore import Datastore

batchscore_blob = Datastore.register_azure_blob_container(
    ws,
    datastore_name="images_datastore",
    container_name="sampledata",
    account_name="pipelinedata",
    overwrite=True)

def_data_store = ws.get_default_datastore()

from azureml.core.dataset import Dataset
from azureml.pipeline.core import PipelineData

input_images = Dataset.File.from_files(
    (batchscore_blob, "batchscoring/images/"))
label_ds = Dataset.File.from_files((batchscore_blob, "batchscoring/labels/"))
output_dir = PipelineData(name="scores",
                          datastore=def_data_store,
                          output_path_on_compute="batchscoring/results")

input_images = input_images.register(workspace=ws, name="input_images")
label_ds = label_ds.register(workspace=ws, name="label_ds")

from azureml.core.model import Model

model = Model(ws, 'tf-dnn-mnist')
Example 12
    ws = Workspace.create(subscription_id=azureSubscriptionID,
                          resource_group=azureResourceGroup,
                          name=azureMLWorkSpaceName,
                          location=azureMLWorkSpaceLocation)
else:
    ws = Workspace.get(azureMLWorkSpaceName,
                       subscription_id=azureSubscriptionID)

# create or use an existing experiment
exp = Experiment(workspace=ws, name=experiment_name)

# register our existing Azure Blob Container with the labeled audio files
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=azureStorageTargetContainer,
    container_name=azureStorageTargetContainer,
    account_name=azureStorgeAccountName,
    account_key=azureStorageKeyName,
    create_if_not_exists=False)

# create a reference where we mount the DataStore to the container instance
dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_compute='data',
                                mode='mount')

# upload any needed files
ws.get_default_datastore().upload(src_dir='.',
                                  target_path='.',
                                  overwrite=True,
                                  show_progress=True)
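To make the mounted data visible to a run, the DataReferenceConfiguration above is typically attached to a run configuration; a minimal sketch, assuming the ds and dr objects from this example (the attribute name follows the classic azureml-core RunConfiguration API):

from azureml.core.runconfig import RunConfiguration

run_config = RunConfiguration()
# mount the registered blob datastore under ./data on the compute target
run_config.data_references = {ds.name: dr}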
Example 13
datastore_name = "bearing_datastore"
dataset_name = "bearing_dataset"
container_name = "bearingdata"

sensor_data = pd.DataFrame()

ws = get_workspace()

try:
    datastore = Datastore.get(ws, datastore_name)
    print("Datastore found: ", datastore_name)
except Exception:
    datastore = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=datastore_name,
        account_name=os.environ.get('AML_BLOB_ACCOUNT_NAME'),
        container_name=container_name,
        account_key=os.environ.get('AML_BLOB_ACCOUNT_KEY'),
        endpoint="core.chinacloudapi.cn")
    print("Datastore registered: ", datastore_name)

for filename in os.listdir(raw_data_dir):
    data = pd.read_csv(os.path.join(raw_data_dir, filename),
                       names=["c1", "c2", "c3", "c4"],
                       sep='\t')
    data_mean = np.array(data.abs().mean())
    data_mean = pd.DataFrame(data_mean.reshape(1, 4))
    data_mean.index = [pd.to_datetime(filename, format='%Y.%m.%d.%H.%M.%S')]
    sensor_data = sensor_data.append(data_mean)
    print('datapoints appended: ', filename)
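After the loop, the aggregated frame could be written back to the registered datastore; a hedged sketch, with an illustrative local file name and target path:

# illustrative only: persist the aggregated features and upload them to the blob datastore
sensor_data.to_csv("bearing_features.csv")
datastore.upload_files(files=["bearing_features.csv"],
                       target_path="features/",
                       overwrite=True,
                       show_progress=True)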
Example 14
# COMMAND ----------

blob_datastore_name = 'dsblob'

subscription_id = "7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7"
resource_group = "rgSampleData"

account_name = "sasampledata"
tenant_id = "72f988bf-86f1-41af-91ab-2d7cd011db47"
client_id = "2a81532b-016b-4c0e-aa43-bd9b97fbdaba"
client_secret = "<service-principal-secret>"   # do not hard-code secrets; load them from a secret store
account_key = "<storage-account-key>"          # do not hard-code keys; load them from a secret store
blob_datastore = Datastore.register_azure_blob_container(workspace=ws,
                                                         datastore_name=blob_datastore_name,
                                                         account_name=account_name,   # Storage account name
                                                         container_name='nyctaxi',    # Blob container name
                                                         account_key=account_key)

# COMMAND ----------

print(blob_datastore.datastore_type)

# COMMAND ----------

from azureml.pipeline.core import PipelineParameter,Pipeline, PipelineData
from azureml.data.data_reference import DataReference

# Use the default blob storage
dsNYCTaxi = Datastore.get(ws, "dsblob")
print('Datastore {} will be used'.format(dsNYCTaxi.name))
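The imports two cells above suggest the datastore is consumed through a DataReference in a pipeline step; a minimal sketch along those lines (the path on the datastore is a placeholder):

nyctaxi_data = DataReference(datastore=dsNYCTaxi,
                             data_reference_name="nyctaxi_raw",
                             path_on_datastore="raw/")  # placeholder path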