def get_blob_datastore(workspace: Workspace, data_store_name: str,
                       storage_name: str, storage_key: str,
                       container_name: str):
    """
    Returns a reference to a datastore
    Parameters:
      workspace (Workspace): existing AzureML Workspace object
      data_store_name (string): data store name
      storage_name (string): blob storage account name
      storage_key (string): blob storage account key
      container_name (string): container name
    Returns:
        Datastore: a reference to datastore
    """
    try:
        blob_datastore = Datastore.get(workspace, data_store_name)
        print("Found Blob Datastore with name: %s", data_store_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=data_store_name,
            account_name=storage_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=storage_key)  # Storage account key
    print("Registered blob datastore with name: %s", data_store_name)
    return blob_datastore
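
For context, a minimal usage sketch (the Workspace.from_config() call and the placeholder account values below are assumptions, not part of the original snippet):

from azureml.core import Workspace

ws = Workspace.from_config()  # assumes a workspace config.json is available locally
datastore = get_blob_datastore(
    workspace=ws,
    data_store_name="training_blob",       # placeholder datastore name
    storage_name="mystorageaccount",       # placeholder storage account
    storage_key="<storage-account-key>",   # placeholder key
    container_name="training-data")        # placeholder container
print(datastore.name)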
Example 2
def create_and_attach_file_storage(cfg, ws):
    if len(cfg.DataReference.localDirectoryFilesList) > 0:
        for ref in cfg.DataReference.localDirectoryFilesList:
            log.info("Attempting to create file share '%s' on storage account '%s'.", ref.remoteFileShare, ref.storageAccountName)
            file_service = FileService(ref.storageAccountName, ref.storageAccountKey)
            created = file_service.create_share(ref.remoteFileShare, fail_on_exist=False)
            if created:
                log.info("File Share '%s' on storage account '%s' created.", ref.remoteFileShare, ref.storageAccountName)
            else:
                log.info("File Share '%s' on storage account '%s' already existed.", ref.remoteFileShare, ref.storageAccountName)
            # Get most recent list of datastores linked to current workspace
            datastores = ws.datastores
            # Validate if share_ds is created
            ds = None if ref.dataref_id not in datastores else Datastore(workspace = ws, name = ref.dataref_id)
            # Register the DS to the workspace
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteFileShare:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True
            if recreate:
                log.info('Registering file share "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteFileShare, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_file_share(workspace = ws,
                                                    datastore_name = ref.dataref_id, 
                                                    file_share_name = ref.remoteFileShare, 
                                                    account_name = ref.storageAccountName, 
                                                    account_key= ref.storageAccountKey,
                                                    overwrite=True,
                                                    )
            else:
                log.info('File share "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteFileShare, ws.name, ref.dataref_id))
Example 3
    def _get_data_references(self, request_id, internal_datastore):
        print(
            'AMLCompute, _get_data_references() called. Request ID: {}'.format(
                request_id))
        # Argument Datastore Name needs to: only contain alphanumeric characters and _.
        request_id_to_use_for_datastore = request_id.replace('-', '_')
        try:
            # setting the overwrite flag to True overwrites any datastore that was created previously with that name

            # internal_datastore stores all user-facing files: list of images, detection results, list of failed images
            # and it so happens that each job also needs the list of images as an input
            internal_datastore_name = 'internal_datastore_{}'.format(
                request_id_to_use_for_datastore)
            internal_account_name = internal_datastore['account_name']
            internal_account_key = internal_datastore['account_key']
            internal_container_name = internal_datastore['container_name']
            internal_datastore = Datastore.register_azure_blob_container(
                self.ws,
                internal_datastore_name,
                internal_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('internal_datastore done')

            # output_datastore stores the output from score.py in each job, which is another container
            # in the same storage account as internal_datastore
            output_datastore_name = 'output_datastore_{}'.format(
                request_id_to_use_for_datastore)
            output_container_name = api_config.AML_CONTAINER
            output_datastore = Datastore.register_azure_blob_container(
                self.ws,
                output_datastore_name,
                output_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('output_datastore done')

        except Exception as e:
            raise RuntimeError(
                'Error in connecting to the datastores for AML Compute: {}'.
                format(str(e)))

        try:
            internal_dir = DataReference(datastore=internal_datastore,
                                         data_reference_name='internal_dir',
                                         mode='mount')

            output_dir = PipelineData(
                'output_{}'.format(request_id_to_use_for_datastore),
                datastore=output_datastore,
                output_mode='mount')
            print('Finished setting up the Data References.')
        except Exception as e:
            raise RuntimeError(
                'Error in creating data references for AML Compute: {}.'.
                format(str(e)))

        return internal_dir, output_dir
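
As a hedged illustration of how the two returned objects are typically wired into a pipeline step (the orchestrator instance, step name, script, and compute target below are hypothetical, not taken from the original code):

from azureml.pipeline.steps import PythonScriptStep

internal_dir, output_dir = orchestrator._get_data_references(request_id, internal_datastore)
score_step = PythonScriptStep(
    name="score",                     # hypothetical step name
    script_name="score.py",           # hypothetical scoring script
    source_directory=".",
    arguments=["--input", internal_dir, "--output", output_dir],
    inputs=[internal_dir],
    outputs=[output_dir],
    compute_target=aml_compute)       # hypothetical ComputeTarget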
Example 4
def register_dataset(path, aml_interface, storage_acct_name, storage_acct_key):
    workspace = aml_interface.workspace
    datastore = Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name='prediction',
        container_name='prediction',
        account_name=storage_acct_name,
        account_key=storage_acct_key)

    prediction_datastore = Datastore.get(workspace, 'prediction')
    datastore_path = [(prediction_datastore, path)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    dataset = dataset.register(workspace=aml_interface.workspace,
                               name='Prediction')
Example 5
def upload_files_to_azure(cfg, ws):
    ''' Looks in the cfg object for local directories and files to upload to AFS and ABS
    input params :
    cfg : Description : configuration object listing the directories/files to upload
    ws : Description : aml workspace object
    ws : Type : aml workspace object (defined in azureml.core.workspace.Workspace)
    '''
    for ref in cfg.DataReference.localDirectoryBlobList:
        uploadContentBeforeRun = ref.uploadContentBeforeRun
        if uploadContentBeforeRun:
            overwriteOnUpload = ref.overwriteOnUpload
            remoteBlobContainer = ref.remoteBlobContainer
            localDirectoryName  = ref.localDirectoryName
            remoteMountPath = ref.remoteMountPath
            ds = Datastore(workspace = ws, name = remoteBlobContainer)
            ds.upload(src_dir=localDirectoryName, target_path=remoteMountPath, overwrite=overwriteOnUpload, show_progress=True)

    for ref in cfg.DataReference.localDirectoryFilesList:
        uploadContentBeforeRun = ref.uploadContentBeforeRun
        if uploadContentBeforeRun:
            overwriteOnUpload = ref.overwriteOnUpload
            remoteFileShare = ref.remoteFileShare
            localDirectoryName = ref.localDirectoryName
            remoteMountPath = ref.remoteMountPath
            ds = Datastore(workspace = ws, name = remoteFileShare)
            ds.upload(src_dir = localDirectoryName, target_path=remoteMountPath, overwrite=overwriteOnUpload, show_progress=True)
Example 6
            def register_dataset(dataset_name, dataframe):
                dataset_config = next(
                    iter(
                        filter(lambda x: x["name"] == dataset_name,
                               self.output_reg_datasets)))

                datastore = dataset_config.get("datastore") or "default"
                description = dataset_config.get("description")
                tags = dataset_config.get("tags")

                if datastore == "default":
                    ds = ws.get_default_datastore()
                else:
                    ds = Datastore.get(workspace=ws, datastore_name=datastore)

                target_path = f'experiment/{run.experiment.name}/run/{run.number}/out/{dataset_name}'

                default_output_dataset_tags = {
                    # Dataset.Tabular.register_pandas_dataframe always writes parquet
                    "format": self.OUTPUT_FORMAT,
                    "experiment": run.experiment.name,
                    "run": run.number
                }

                output_dataset_tags = {**default_output_dataset_tags, **tags}

                Dataset.Tabular.register_pandas_dataframe(
                    dataframe,
                    target=(ds, target_path),
                    name=dataset_name,
                    description=description,
                    tags=output_dataset_tags)
Example 7
def register_data_store(work_space,
                        data_store_name,
                        container_name,
                        blob_account_name,
                        blob_account_key,
                        set_default=False):
    """
    register_data_store - register datastore

    :param Workspace work_space: AML workspace
    :param str data_store_name: datastore name
    :param str container_name: blob container name
    :param str blob_account_name: blob storage account name
    :param str blob_account_key: blob storage account key
    :param bool set_default: if True, set this datastore as the workspace default

    :returns: data_store
    :rtype: azureml.core.datastore.Datastore

    """
    data_store = Datastore.register_azure_blob_container(
        workspace=work_space,
        datastore_name=data_store_name,
        container_name=container_name,
        account_name=blob_account_name,
        account_key=blob_account_key,
        create_if_not_exists=True)
    # Set it to default data store for the AML workspace
    if set_default:
        work_space.set_default_datastore(data_store_name)
    return data_store
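
A short usage sketch; the workspace lookup and the placeholder account values are assumptions:

from azureml.core import Workspace

ws = Workspace.from_config()
data_store = register_data_store(
    work_space=ws,
    data_store_name="model_output",           # placeholder names and key
    container_name="model-output",
    blob_account_name="mystorageaccount",
    blob_account_key="<storage-account-key>",
    set_default=True)                         # also make it the workspace default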
Example 8
    def config(ws, blob_datastore_name, account_name, container_name,
               account_key):

        try:
            blob_datastore = Datastore.get(ws, blob_datastore_name)
            print("Found Blob Datastore with name: %s" % blob_datastore_name)
        except HttpOperationError:
            blob_datastore = Datastore.register_azure_blob_container(
                workspace=ws,
                datastore_name=blob_datastore_name,
                account_name=account_name,  # Storage account name
                container_name=container_name,  # Name of Azure blob container
                account_key=account_key)  # Storage account key
            print("Registered blob datastore with name: %s" %
                  blob_datastore_name)

        return blob_datastore
Example 9
def download_model(workspace, path_on_data_store, target_path='.', overwrite=True):
    blob_data_store = Datastore.get_default(workspace)
    number_of_files_successfully_downloaded = blob_data_store.download(target_path=target_path,
                                                                       prefix=path_on_data_store, overwrite=overwrite)
    if number_of_files_successfully_downloaded == 0:
        print('No model files were downloaded.')
    else:
        print('Model downloaded to the directory {}'.format(target_path))
Example 10
    def mount_datastores(self, datastore_name, container_name, data_ref_path,
                         data_ref_name=None):
        res_mngr = ResourceManager(self.args.spn_id, self.args.spn_secret, self.args.tenant_id)
        self.account_key = res_mngr.get_storage_account_key(
            self.args.account_name, self.args.subscription_id, self.args.resource_group_name)
        ds = Datastore.register_azure_blob_container(
            self.ws, datastore_name, container_name, self.args.account_name,
            account_key=self.account_key, create_if_not_exists=True)
        base_mount = ds.path(path=data_ref_path, data_reference_name=data_ref_name).as_mount()
        return base_mount
Example 11
def createDataReference(workspace, storage_name, storage_key,
                        storage_container_name, data_store_name,
                        data_reference_name):
    '''
        If not present, registers a new azureml.core.datastore.Datastore.
        Once the data store is in hand it creates an instance of azureml.data.data_reference.DataReference that 
        can be used in an Azure ML pipeline step. 

        PARAMS: 
            workspace               : azureml.core.Workspace    : Existing AMLS Workspace
            storage_name            : string                    : Name of the Azure Storage Account
            storage_key             : string                    : Access Key to the Azure Storage Account
            storage_container_name  : string                    : Container name to receive blobs. Must exist
            data_store_name         : string                    : Name of the registered data store.
            data_reference_name     : string                    : Name of the data reference

        RETURNS: 
            tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)

    '''
    data_store = None

    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception as ex:
        print("Creating data store - ", data_store_name)

        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
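
A hedged example of calling this helper and handing the resulting DataReference to a pipeline step (all literal values below are placeholders, and ws is assumed to be an existing azureml.core.Workspace):

data_store, data_ref = createDataReference(
    workspace=ws,
    storage_name="mystorageaccount",
    storage_key="<storage-account-key>",
    storage_container_name="rawdata",
    data_store_name="rawdata_store",
    data_reference_name="raw_input")
# data_ref can then be passed to a pipeline step, e.g. PythonScriptStep(..., inputs=[data_ref])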
Example 12
    def get_ds_object(ws, name):
        """
        get_ds_object - Get workspace datastore object

        :param Workspace ws: workspace
        :param str name: data store name

        :returns: the named datastore
        :rtype: azureml.core.datastore.Datastore

        """
        return Datastore.get(ws, name)
Example 13
def create_and_attach_blob_storage(cfg, ws):
    """ If required, creates the blob storage containers in the datareferences of cfg """
    if len(cfg.DataReference.localDirectoryBlobList) > 0:
        for ref in cfg.DataReference.localDirectoryBlobList:
            log.info("Attempting to create Blob Container '%s' on storage account '%s'.", ref.remoteBlobContainer, ref.storageAccountName)
            blob_service = BlockBlobService(ref.storageAccountName, ref.storageAccountKey)
            created = blob_service.create_container(ref.remoteBlobContainer, fail_on_exist=False)
            if created:
                log.info("Blob Container '%s' on storage account '%s' created.", ref.remoteBlobContainer, ref.storageAccountName)
            else:
                log.info("Blob Container '%s' on storage account '%s' already existed.", ref.remoteBlobContainer, ref.storageAccountName)
            # Get most recent list of datastores linked to current workspace
            datastores = ws.datastores
            # Validate if blob_ds is created
            ds = None if ref.dataref_id not in datastores else Datastore(workspace = ws, name = ref.dataref_id)
            # If DS exists and isn't mapped to the right place
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteBlobContainer:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True
            if recreate:
                log.info('Registering blob "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_blob_container(workspace = ws,
                                                    datastore_name = ref.dataref_id, 
                                                    container_name = ref.remoteBlobContainer, 
                                                    account_name = ref.storageAccountName, 
                                                    account_key = ref.storageAccountKey,
                                                    overwrite = True,  # Overwrites the datastore (not the data itself, the object) if it already is part of this workspace
                                                    )
            else:
                log.info('Blob "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
Example 14
def get_or_create_dataset(azure_config: AzureConfig,
                          azure_dataset_id: str) -> Dataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is created
    and registered, assuming that the files are in a folder that has the same name as the dataset. For example, if
    azure_dataset_id is 'foo', then the 'foo' dataset is pointing to <container_root>/datasets/foo folder.

    WARNING: the behaviour of Dataset.File.from_files, used below, is idiosyncratic. For example,
    if "mydataset" storage has two "foo..." subdirectories each containing
    a file dataset.csv and a directory ABC,

    datastore = Datastore.get(workspace, "mydataset")
    # This dataset has the file(s) in foo-bar01 at top level, e.g. dataset.csv
    ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
    # This dataset has two directories at top level, each with a name matching foo-bar*, and each
    # containing dataset.csv.
    ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])
    # This dataset contains a single directory "mydataset" at top level, containing a subdirectory
    # foo-bar01, containing dataset.csv and (part of) ABC.
    ds3 = Dataset.File.from_files([(datastore, "foo-bar01/*"),
                                   (datastore, "foo-bar01/ABC/abc_files/*/*.nii.gz")])

    These behaviours can be verified by calling "ds.download()" on each dataset ds.
    """
    if not azure_config.azureml_datastore:
        raise ValueError(
            "No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)"
        )
    logging.info(
        f"Retrieving datastore '{azure_config.azureml_datastore}' from AzureML workspace"
    )
    workspace = azure_config.get_workspace()
    datastore = Datastore.get(workspace, azure_config.azureml_datastore)
    try:
        logging.info(
            f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        logging.info(
            f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'"
        )
        # See WARNING above before changing the from_files call!
        azureml_dataset = Dataset.File.from_files([(datastore,
                                                    azure_dataset_id)])
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
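
A hedged sketch of consuming the returned dataset in a training run, assuming an AzureConfig instance named azure_config; the ScriptRunConfig values are illustrative only, and 'foo' mirrors the folder name used in the docstring:

from azureml.core import ScriptRunConfig

dataset = get_or_create_dataset(azure_config, azure_dataset_id="foo")
src = ScriptRunConfig(
    source_directory=".",
    script="train.py",    # hypothetical training script
    arguments=["--data", dataset.as_named_input("input").as_mount()])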
Example 15
def create_dataset(ws):  
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset','data.csv')

    data = pd.read_csv(
            './data.csv.zip',
            compression='zip',
            sep='|'
        )

    # Clean dataset 
    data = clean_data(data)

    # Register Dataset in Workspace
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    return dataset
Example 16
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"

    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id
                  })['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )

    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create',
                            json={
                                "instance_pool_name":
                                pool_name,
                                "node_type_id":
                                "Standard_D3_v2",
                                "idle_instance_autotermination_minutes":
                                10,
                                "preloaded_spark_versions":
                                [DATABRICKS_RUNTIME_VERSION],
                            })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = f"/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step

    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")

    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)

    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")

    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    #   feature_eng_output as input.
    # ...

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
Example 17
# this is the URL to the CSV file containing the connected car component descriptions
cardata_url = ('https://quickstartsws9073123377.blob.core.windows.net/'
            'azureml-blobstore-0d1c4218-a5f9-418b-bf55-902b65277b85/'
            'quickstarts/connected-car-data/connected-car_components.csv')

cardata_ds_name = 'connected_car_components'
cardata_ds_description = 'Connected car components data'

embedding_dim = 100                                        
training_samples = 90000                                 
validation_samples = 5000    
max_words = 10000

run = Run.get_context()
ws = run.experiment.workspace
ds = Datastore.get_default(ws)

#-------------------------------------------------------------------
#
# Process GloVe embeddings dataset
#
#-------------------------------------------------------------------

# The GloVe embeddings dataset is static so we will only register it once with the workspace

print("Downloading GloVe embeddings...")

try:
    glove_ds = Dataset.get_by_name(workspace=ws, name=glove_ds_name)
    print('GloVe embeddings dataset already registered.')
except:
Example 18
from azureml.pipeline.core import Schedule
from azureml.core.datastore import Datastore

   datastore = Datastore(workspace="Playground", name="workspaceblobstore")

   schedule = Schedule.create(workspace, name="TestSchedule", pipeline_id="3100e87c-3300-400b-a5a5-470e85a100b3"
                              experiment_name="working version", datastore=datastore,
                              polling_interval=25, path_on_datastore="file/path")
    def __init__(self, ws, service_name, model_name):

        self.__ws = ws
        self.__service_name = service_name
        self.__model = Model(self.__ws, name=model_name)
        self.__datastore = Datastore.get_default(ws)
Example 20
def InitAML(model_name, env, svcpw, interactive=False, create_ws=False):

    print("Environment is  ", env)

    configFilePath = "./environment_setup/Config/config_" + env + ".ini"
    configFile = ConfigParser()
    configFile.read(configFilePath)

    svc_pr_pd = svcpw
    tenant_id = configFile.get('PARAMS', 'tenant_id')
    service_principal_id = configFile.get('PARAMS', 'service_principal_id')
    subscription_id = configFile.get('PARAMS', 'subscription_id')
    resource_group = configFile.get('PARAMS', 'resource_group')
    blobname = configFile.get('PARAMS', 'BlobName')
    workspace_name = configFile.get('PARAMS', 'WorkSpace')
    data_factory_name = configFile.get('PARAMS', 'Data_factory_name')
    location = configFile.get('PARAMS', 'location')

    fp = './' + model_name + '/aml_service/setup.ini'
    conf = ConfigParser()
    conf.read(fp)

    AML_COMPUTE_CLUSTER_NAME = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_NAME')
    AML_COMPUTE_CLUSTER_MIN_NODES = conf.get('PARAMS',
                                             'AML_COMPUTE_CLUSTER_MIN_NODES')
    AML_COMPUTE_CLUSTER_MAX_NODES = conf.get('PARAMS',
                                             'AML_COMPUTE_CLUSTER_MAX_NODES')
    AML_COMPUTE_CLUSTER_SKU = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_SKU')

    if interactive:
        auth = InteractiveLoginAuthentication(tenant_id=tenant_id)

    else:
        auth = ServicePrincipalAuthentication(
            tenant_id=tenant_id,
            service_principal_id=service_principal_id,
            service_principal_password=svc_pr_pd)

    try:
        ws = Workspace(subscription_id=subscription_id,
                       resource_group=resource_group,
                       workspace_name=workspace_name,
                       auth=auth)
        print('Library configuration succeeded')
    except Exception:
        if create_ws:
            ws = Workspace.create(name=workspace_name,
                                  auth=auth,
                                  subscription_id=subscription_id,
                                  resource_group=resource_group,
                                  create_resource_group=False,
                                  location=location)
            print('Workspace not found and is created')
        else:
            print('Workspace not found and not created')

    print('workspace_name:',
          ws.name,
          '\nworkspace_location:',
          ws.location,
          '\nworkspace_resource_group:',
          ws.resource_group,
          sep='\t')

    # choose a name for your cluster
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME",
                                  AML_COMPUTE_CLUSTER_NAME)

    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and type(compute_target) is AmlCompute:
            print('found compute target. just use it. ' + compute_name)
    else:
        print('creating a new compute target...')
        compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES",
                                           AML_COMPUTE_CLUSTER_MIN_NODES)
        compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES",
                                           AML_COMPUTE_CLUSTER_MAX_NODES)
        # This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
        vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU",
                                 AML_COMPUTE_CLUSTER_SKU)
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            min_nodes=compute_min_nodes,
            max_nodes=compute_max_nodes)

        # create the cluster
        compute_target = ComputeTarget.create(ws, compute_name,
                                              provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

        # For a more detailed view of current AmlCompute status, use get_status()
        print(compute_target.get_status().serialize())

    try:
        datastore = Datastore(ws, name=blobname)
        print("Found Blob Datastore with name: %s" % datastore)
    except:
        print("No datastore with name: %s" % blobname)
        sys.exit(-1)

    try:
        data_factory = DataFactoryCompute(ws, data_factory_name)
        print('data_factory ', data_factory)
    except ComputeTargetException as e:
        if 'ComputeTargetNotFound' in e.message:
            print('Data factory Compute not found, creating...')
            provisioning_config = DataFactoryCompute.provisioning_configuration(
            )
            data_factory = ComputeTarget.create(ws, data_factory_name,
                                                provisioning_config)
            data_factory.wait_for_completion()
        else:
            print('Data factory Compute not found, Entering Else Section...')
            raise e

    return datastore, compute_target, ws, data_factory
Example 21
        lambda x: x.created_time == max(model.created_time for model in model_list),
        model_list,
    )
)

model = Model(ws, name=production_model.name)
model

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config)

service.wait_for_deployment(show_output=True)

aci_webservice = {}
aci_webservice["aci_name"] = service.name
aci_webservice["aci_url"] = service.scoring_uri
with open(args.aci_store+"/aci_webservice.json", "w") as outfile:
    json.dump(aci_webservice, outfile)

# Get the blob storage associated with the workspace
pipeline_datastore = Datastore(ws, "datastore_pipeline")

# Upload the web service details to the main blob folder
pipeline_datastore.upload_files([args.aci_store+"/aci_webservice.json"], target_path="webservice"+'/'+args.repo_owner+'/'+args.repo_name, overwrite=True)
    
print("Deployed ACI Webservice: {} \nWebservice Uri: {}".format(service.name, service.scoring_uri)
    
Example 22
import argparse
from pathlib import Path

from azureml.core.datastore import Datastore
from azureml.core.workspace import Workspace

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--subscription-id", type=str)
    parser.add_argument("--resource-group", type=str)
    parser.add_argument("--workspace-name", type=str)
    parser.add_argument("--datastore-name", type=str)
    parser.add_argument("--data-directory", type=str)
    parser.add_argument("--dataset-name", type=str)
    args = parser.parse_args()

    print(args.workspace_name)
    workspace = Workspace(
        subscription_id=args.subscription_id,
        resource_group=args.resource_group,
        workspace_name=args.workspace_name,
    )
    datastore = Datastore.get(workspace, args.datastore_name)
    local_path = Path(args.data_directory)
    for phase in ["train", "val"]:
        local_directory = str(local_path / phase)
        target_path = str(Path(args.dataset_name) / phase)
        datastore.upload(local_directory,
                         target_path=target_path,
                         show_progress=True)
Example 23
    cv_results['n_features'] = X.shape[1]
    cv_results['y_0'] = y.tolist().count(0)
    cv_results['y_1'] = y.tolist().count(1)

    print(cv_results["mean_test_pr_auc"].to_string(index=False))
    run.log(name="mean_test_pr_auc",
            value=cv_results["mean_test_pr_auc"].to_string(index=False))

    if not os.path.isdir(args.train_model):
        os.makedirs(args.train_model, exist_ok=True)

    timestamp_id = datetime.datetime.now()
    time = timestamp_id.strftime("%m-%d-%Y_%H%M")

    model_name = "{}__{}.json".format(args.repo_owner, args.repo_name)
    output_path = os.path.join(args.train_model, model_name)

    with open(output_path, 'w') as outfile:
        cv_results.to_json(outfile, orient='table', index=False)

    # Get the blob storage associated with the workspace
    pipeline_datastore = Datastore(ws, "datastore_pipeline")

    # Upload the cross-validation results to the main blob folder
    pipeline_datastore.upload_files([args.train_model + '/' + model_name],
                                    target_path="train_model" + '/' +
                                    args.repo_owner + '/' + args.repo_name +
                                    '/' + time,
                                    overwrite=True)

print("Model is trained!")
Example 24
from azureml.core import Workspace
ws = Workspace.from_config()

from azureml.core.datastore import Datastore

batchscore_blob = Datastore.register_azure_blob_container(
    ws,
    datastore_name="images_datastore",
    container_name="sampledata",
    account_name="pipelinedata",
    overwrite=True)

def_data_store = ws.get_default_datastore()

from azureml.core.dataset import Dataset
from azureml.pipeline.core import PipelineData

input_images = Dataset.File.from_files(
    (batchscore_blob, "batchscoring/images/"))
label_ds = Dataset.File.from_files((batchscore_blob, "batchscoring/labels/"))
output_dir = PipelineData(name="scores",
                          datastore=def_data_store,
                          output_path_on_compute="batchscoring/results")

input_images = input_images.register(workspace=ws, name="input_images")
label_ds = label_ds.register(workspace=ws, name="label_ds")

from azureml.core.model import Model

model = Model(ws, 'tf-dnn-mnist')
Example 25
    ws = Workspace.create(subscription_id=azureSubscriptionID,
                          resource_group=azureResourceGroup,
                          name=azureMLWorkSpaceName,
                          location=azureMLWorkSpaceLocation)
else:
    ws = Workspace.get(azureMLWorkSpaceName,
                       subscription_id=azureSubscriptionID)

# create or use an existing experiment
exp = Experiment(workspace=ws, name=experiment_name)

# register our existing Azure Blob Container with the labled audio files
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=azureStorageTargetContainer,
    container_name=azureStorageTargetContainer,
    account_name=azureStorgeAccountName,
    account_key=azureStorageKeyName,
    create_if_not_exists=False)

# create a reference where we mount the DataStore to the container instance
dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_compute='data',
                                mode='mount')

# upload any needed files
ws.get_default_datastore().upload(src_dir='.',
                                  target_path='.',
                                  overwrite=True,
                                  show_progress=True)
Example 26
if 'creditcard' not in ws.datasets:

    #Set blobdatastore
    blob_datastore_name = 'MyBlobDatastore'
    account_name = os.getenv(
        "BLOB_ACCOUNTNAME_62",
        "PUT YOUR STORAGE ACCOUNT NAME HERE")  # Storage account name
    container_name = os.getenv(
        "BLOB_CONTAINER_62",
        "PUT YOUR STORAGE CONTAINER NAME HERE")  # Name of Azure blob container
    account_key = os.getenv(
        "BLOB_ACCOUNT_KEY_62",
        "PUT YOUR STORAGE ACCOUNT KEY HERE")  # Storage account key

    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except Exception:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)  # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)

    blob_data_ref = DataReference(datastore=blob_datastore,
                                  data_reference_name="blob_test_data",
                                  path_on_datastore="testdata")
    csv_path = (blob_datastore, '/creditcard.csv')
Example 27
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('No compute cluster named {}'.format(cluster_name))
    exit()

curated_env_name = 'Resnet50v15-CPU-cluster'
# pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env = Environment.from_conda_specification(
    name=curated_env_name, file_path='./conda_dependencies.yml')

project_folder = './'
data_path = 'datasets'

datastore = Datastore.get(ws, 'workspaceblobstore')
dataset = Dataset.File.from_files(path=(datastore, data_path))
data_loc = dataset.as_named_input('input').as_mount()

src = ScriptRunConfig(
    source_directory=project_folder,
    # command=['ls'],
    script='train_resnet.py',
    arguments=[
        '--num_epochs',
        16,
        '--batch',
        '32',
        '--shuffle',
        'True',
        '--dataloc',
Example 28
    workspace_name="<your workspace name>"

    if subscription_id.startswith("<"): 
        raise ex
    else: # write and reload from config file
        config = {"Scope": "/subscriptions/" + subscription_id + "/resourceGroups/" + resource_group + "/providers/Microsoft.MachineLearningServices/workspaces/" + workspace_name +"/projects/samples"}
        import json
        import os
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        with open(config_path, "w") as fo:
            fo.write(json.dumps(config))
        ws = Workspace.from_config(path=config_path)

from azureml.core.datastore import Datastore

relevance_datastore = Datastore(ws, 'adls_relevance09')

# Retrieve or create the computer target

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "Cmpt-112GB-16Cr"
# cluster_name = "Cmpt-512GB-64Cr"
if cluster_name not in ws.compute_targets:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_v2",
                                                               max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
    ws = Workspace.get(name=workspace_name,
                       subscription_id=subscription_id,
                       resource_group=resource_group,
                       auth=service_principal)
    return (ws)


if __name__ == '__main__':
    global run
    run = Run.get_context()
    auth_params = get_args()
    ws = get_ws(auth_params)

    datastore_names = list(ws.datastores.keys())
    def_data_store = ws.get_default_datastore()
    def_blob_store = Datastore(ws, "workspaceblobstore")

    data_temp_folder = os.path.join(cwd, "data_temp")
    create_folders([data_temp_folder])

    dataset = {
        'dataset':
        "https://github.com/rouzbeh-afrasiabi/PublicDatasets/raw/master/train.csv.zip"
    }
    word_vectors = {
        "en_vectors_web_lg":
        "https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz"
    }

    toDownload = [dataset, word_vectors]
    download_files(toDownload, data_temp_folder)
def create_DDoS_datasets(ws):  
    dtypes = {
        'Src IP': 'category',
        'Src Port': 'uint16',
        'Dst IP': 'category',
        'Dst Port': 'uint16',
        'Protocol': 'category',
        'Flow Duration': 'uint32',
        'Tot Fwd Pkts': 'uint32',
        'Tot Bwd Pkts': 'uint32',
        'TotLen Fwd Pkts': 'float32',
        'TotLen Bwd Pkts': 'float32',
        'Fwd Pkt Len Max': 'float32',
        'Fwd Pkt Len Min': 'float32',
        'Fwd Pkt Len Mean': 'float32',
        'Fwd Pkt Len Std': 'float32',
        'Bwd Pkt Len Max': 'float32',
        'Bwd Pkt Len Min': 'float32',
        'Bwd Pkt Len Mean': 'float32',
        'Bwd Pkt Len Std': 'float32',
        'Flow Byts/s': 'float32',
        'Flow Pkts/s': 'float32',
        'Flow IAT Mean': 'float32',
        'Flow IAT Std': 'float32',
        'Flow IAT Max': 'float32',
        'Flow IAT Min': 'float32',
        'Fwd IAT Tot': 'float32',
        'Fwd IAT Mean': 'float32',
        'Fwd IAT Std': 'float32',
        'Fwd IAT Max': 'float32',
        'Fwd IAT Min': 'float32',
        'Bwd IAT Tot': 'float32',
        'Bwd IAT Mean': 'float32',
        'Bwd IAT Std': 'float32',
        'Bwd IAT Max': 'float32',
        'Bwd IAT Min': 'float32',
        'Fwd PSH Flags': 'category',
        'Bwd PSH Flags': 'category',
        'Fwd URG Flags': 'category',
        'Bwd URG Flags': 'category',
        'Fwd Header Len': 'uint32',
        'Bwd Header Len': 'uint32',
        'Fwd Pkts/s': 'float32',
        'Bwd Pkts/s': 'float32',
        'Pkt Len Min': 'float32',
        'Pkt Len Max': 'float32',
        'Pkt Len Mean': 'float32',
        'Pkt Len Std': 'float32',
        'Pkt Len Var': 'float32',
        'FIN Flag Cnt': 'category',
        'SYN Flag Cnt': 'category',
        'RST Flag Cnt': 'category',
        'PSH Flag Cnt': 'category',
        'ACK Flag Cnt': 'category',
        'URG Flag Cnt': 'category',
        'CWE Flag Count': 'category',
        'ECE Flag Cnt': 'category',
        'Down/Up Ratio': 'float32',
        'Pkt Size Avg': 'float32',
        'Fwd Seg Size Avg': 'float32',
        'Bwd Seg Size Avg': 'float32',
        'Fwd Byts/b Avg': 'uint32',
        'Fwd Pkts/b Avg': 'uint32',
        'Fwd Blk Rate Avg': 'uint32',
        'Bwd Byts/b Avg': 'uint32',
        'Bwd Pkts/b Avg': 'uint32',
        'Bwd Blk Rate Avg': 'uint32',
        'Subflow Fwd Pkts': 'uint32',
        'Subflow Fwd Byts': 'uint32',
        'Subflow Bwd Pkts': 'uint32',
        'Subflow Bwd Byts': 'uint32',
        'Init Fwd Win Byts': 'uint32',
        'Init Bwd Win Byts': 'uint32',
        'Fwd Act Data Pkts': 'uint32',
        'Fwd Seg Size Min': 'uint32',
        'Active Mean': 'float32',
        'Active Std': 'float32',
        'Active Max': 'float32',
        'Active Min': 'float32',
        'Idle Mean': 'float32',
        'Idle Std': 'float32',
        'Idle Max': 'float32',
        'Idle Min': 'float32',
        'Label': 'category'
    }

    data = pd.read_csv(
            './final_dataset.csv',
            parse_dates=['Timestamp'],
            usecols=[*dtypes.keys(), 'Timestamp'],
            engine='c',
            low_memory=True,
            na_values=np.inf
        )

    # There are over 12 million rows in the original dataset. For this project, that much data takes far too long to process, so I'm randomly sampling only 0.5% of it.
    data = data.sample(frac=0.005)

    # Register Base Dataset in Workspace
    datastore = Datastore(ws)
    name = "DDoS Dataset"
    description_text = "DDoS DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    # Clean dataset and register the clean version
    cleaned_data = clean_data(data)
    
    clean_dataset_name = "Clean DDoS Dataset"
    clean_description_text = description_text + " that has been cleaned"
    clean_dataset = TabularDatasetFactory.register_pandas_dataframe(cleaned_data,
                               datastore,
                               clean_dataset_name,
                               description=clean_description_text)