from azureml.core import Workspace
from azureml.core.datastore import Datastore
from msrest.exceptions import HttpOperationError


def get_blob_datastore(workspace: Workspace, data_store_name: str, storage_name: str,
                       storage_key: str, container_name: str):
    """
    Returns a reference to a datastore

    Parameters:
      workspace (Workspace): existing AzureML Workspace object
      data_store_name (string): data store name
      storage_name (string): blob storage account name
      storage_key (string): blob storage account key
      container_name (string): container name

    Returns:
      Datastore: a reference to the datastore
    """
    try:
        blob_datastore = Datastore.get(workspace, data_store_name)
        print("Found Blob Datastore with name: %s" % data_store_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=data_store_name,
            account_name=storage_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=storage_key)        # Storage account key
        print("Registered blob datastore with name: %s" % data_store_name)
    return blob_datastore
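
# A minimal usage sketch for get_blob_datastore above. The workspace config file and the
# storage account, key and container values are hypothetical placeholders, not values from this code.
ws = Workspace.from_config()
datastore = get_blob_datastore(workspace=ws,
                               data_store_name="my_blob_datastore",
                               storage_name="mystorageaccount",
                               storage_key="<storage-account-key>",
                               container_name="mycontainer")
print(datastore.name, datastore.container_name)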
def create_and_attach_file_storage(cfg, ws):
    if len(cfg.DataReference.localDirectoryFilesList) > 0:
        for ref in cfg.DataReference.localDirectoryFilesList:
            log.info("Attempting to create file share '%s' on storage account '%s'.",
                     ref.remoteFileShare, ref.storageAccountName)
            file_service = FileService(ref.storageAccountName, ref.storageAccountKey)
            created = file_service.create_share(ref.remoteFileShare, fail_on_exist=False)
            if created:
                log.info("File Share '%s' on storage account '%s' created.",
                         ref.remoteFileShare, ref.storageAccountName)
            else:
                log.info("File Share '%s' on storage account '%s' already existed.",
                         ref.remoteFileShare, ref.storageAccountName)

            # Get the most recent list of datastores linked to the current workspace
            datastores = ws.datastores

            # Check whether the file-share datastore is already registered
            ds = None if ref.dataref_id not in datastores else Datastore(workspace=ws, name=ref.dataref_id)

            # Register the datastore with the workspace if needed
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteFileShare:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True

            if recreate:
                log.info('Registering file share "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteFileShare, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_file_share(workspace=ws,
                                                         datastore_name=ref.dataref_id,
                                                         file_share_name=ref.remoteFileShare,
                                                         account_name=ref.storageAccountName,
                                                         account_key=ref.storageAccountKey,
                                                         overwrite=True)
            else:
                log.info('File share "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteFileShare, ws.name, ref.dataref_id))
def _get_data_references(self, request_id, internal_datastore):
    print('AMLCompute, _get_data_references() called. Request ID: {}'.format(request_id))

    # The datastore name needs to contain only alphanumeric characters and _.
    request_id_to_use_for_datastore = request_id.replace('-', '_')

    try:
        # Setting the overwrite flag to True overwrites any datastore previously created with that name.
        # internal_datastore stores all user-facing files: the list of images, detection results and the
        # list of failed images; each job also needs the list of images as an input.
        internal_datastore_name = 'internal_datastore_{}'.format(request_id_to_use_for_datastore)
        internal_account_name = internal_datastore['account_name']
        internal_account_key = internal_datastore['account_key']
        internal_container_name = internal_datastore['container_name']
        internal_datastore = Datastore.register_azure_blob_container(
            self.ws,
            internal_datastore_name,
            internal_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('internal_datastore done')

        # output_datastore stores the output of score.py from each job, which is another container
        # in the same storage account as internal_datastore.
        output_datastore_name = 'output_datastore_{}'.format(request_id_to_use_for_datastore)
        output_container_name = api_config.AML_CONTAINER
        output_datastore = Datastore.register_azure_blob_container(
            self.ws,
            output_datastore_name,
            output_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('output_datastore done')
    except Exception as e:
        raise RuntimeError('Error in connecting to the datastores for AML Compute: {}'.format(str(e)))

    try:
        internal_dir = DataReference(datastore=internal_datastore,
                                     data_reference_name='internal_dir',
                                     mode='mount')
        output_dir = PipelineData('output_{}'.format(request_id_to_use_for_datastore),
                                  datastore=output_datastore,
                                  output_mode='mount')
        print('Finished setting up the Data References.')
    except Exception as e:
        raise RuntimeError('Error in creating data references for AML Compute: {}.'.format(str(e)))

    return internal_dir, output_dir
def register_dataset(path, aml_interface, storage_acct_name, storage_acct_key):
    workspace = aml_interface.workspace
    datastore = Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name='prediction',
        container_name='prediction',
        account_name=storage_acct_name,
        account_key=storage_acct_key)
    prediction_datastore = Datastore.get(workspace, 'prediction')
    datastore_path = [(prediction_datastore, path)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    dataset = dataset.register(workspace=aml_interface.workspace,
                               name='Prediction')
def upload_files_to_azure(cfg, ws):
    '''
    Looks in the cfg object for directories and files to upload to Azure File Share (AFS)
    and Azure Blob Storage (ABS).

    input params:
      cfg : configuration object holding the DataReference lists
      ws  : AML workspace object (azureml.core.workspace.Workspace)
    '''
    for ref in cfg.DataReference.localDirectoryBlobList:
        uploadContentBeforeRun = ref.uploadContentBeforeRun
        if uploadContentBeforeRun:
            overwriteOnUpload = ref.overwriteOnUpload
            remoteBlobContainer = ref.remoteBlobContainer
            localDirectoryName = ref.localDirectoryName
            remoteMountPath = ref.remoteMountPath
            ds = Datastore(workspace=ws, name=remoteBlobContainer)
            ds.upload(src_dir=localDirectoryName,
                      target_path=remoteMountPath,
                      overwrite=overwriteOnUpload,
                      show_progress=True)

    for ref in cfg.DataReference.localDirectoryFilesList:
        uploadContentBeforeRun = ref.uploadContentBeforeRun
        if uploadContentBeforeRun:
            overwriteOnUpload = ref.overwriteOnUpload
            remoteFileShare = ref.remoteFileShare
            localDirectoryName = ref.localDirectoryName
            remoteMountPath = ref.remoteMountPath
            ds = Datastore(workspace=ws, name=remoteFileShare)
            ds.upload(src_dir=localDirectoryName,
                      target_path=remoteMountPath,
                      overwrite=overwriteOnUpload,
                      show_progress=True)
def register_dataset(dataset_name, dataframe):
    dataset_config = next(
        iter(filter(lambda x: x["name"] == dataset_name, self.output_reg_datasets)))
    datastore = dataset_config.get("datastore") or "default"
    description = dataset_config.get("description")
    tags = dataset_config.get("tags") or {}

    if datastore == "default":
        ds = ws.get_default_datastore()
    else:
        ds = Datastore.get(workspace=ws, datastore_name=datastore)

    target_path = f'experiment/{run.experiment.name}/run/{run.number}/out/{dataset_name}'

    default_output_dataset_tags = {
        # Dataset.Tabular.register_pandas_dataframe always writes a parquet file
        "format": self.OUTPUT_FORMAT,
        "experiment": run.experiment.name,
        "run": run.number
    }
    output_dataset_tags = {**default_output_dataset_tags, **tags}

    Dataset.Tabular.register_pandas_dataframe(
        dataframe,
        target=(ds, target_path),
        name=dataset_name,
        description=description,
        tags=output_dataset_tags)
def register_data_store(work_space, data_store_name, container_name,
                        blob_account_name, blob_account_key, set_default=False):
    """
    register_data_store - register a blob container as an AML datastore

    :param Workspace work_space: AML workspace
    :param str data_store_name: name to register the datastore under
    :param str container_name: blob container name
    :param str blob_account_name: storage account name
    :param str blob_account_key: storage account key
    :param bool set_default: make this the workspace's default datastore
    :returns: data_store
    :rtype: data store object
    """
    data_store = Datastore.register_azure_blob_container(
        workspace=work_space,
        datastore_name=data_store_name,
        container_name=container_name,
        account_name=blob_account_name,
        account_key=blob_account_key,
        create_if_not_exists=True)

    # Set it as the default data store for the AML workspace
    if set_default:
        work_space.set_default_datastore(data_store_name)
    return data_store
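
# A minimal usage sketch for register_data_store above, registering a container and making it the
# workspace default. The workspace config, account/container names and key are hypothetical placeholders.
from azureml.core import Workspace

ws = Workspace.from_config()
training_store = register_data_store(work_space=ws,
                                     data_store_name="training_data",
                                     container_name="trainingdata",
                                     blob_account_name="mystorageaccount",
                                     blob_account_key="<storage-account-key>",
                                     set_default=True)
print(ws.get_default_datastore().name)  # should now print 'training_data'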
def config(ws, blob_datastore_name, account_name, container_name, account_key):
    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)        # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)
    return blob_datastore
def download_model(workspace, path_on_data_store, target_path='.', overwrite=True):
    blob_data_store = Datastore.get_default(workspace)
    number_of_files_successfully_downloaded = blob_data_store.download(
        target_path=target_path,
        prefix=path_on_data_store,
        overwrite=overwrite)
    if number_of_files_successfully_downloaded == 0:
        print('No model was downloaded.')
    else:
        print('Model downloaded to the directory {}'.format(target_path))
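
# A minimal usage sketch for download_model above; 'models/latest' is a hypothetical prefix on the
# workspace's default datastore, and the workspace config file is assumed to exist.
from azureml.core import Workspace

ws = Workspace.from_config()
download_model(ws, path_on_data_store='models/latest',
               target_path='./downloaded_model', overwrite=True)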
def mount_datastores(self, datastore_name, container_name, data_ref_path, data_ref_name=None):
    res_mngr = ResourceManager(self.args.spn_id, self.args.spn_secret, self.args.tenant_id)
    self.account_key = res_mngr.get_storage_account_key(
        self.args.account_name, self.args.subscription_id, self.args.resource_group_name)

    ds = Datastore.register_azure_blob_container(
        self.ws,
        datastore_name,
        container_name,
        self.args.account_name,
        account_key=self.account_key,
        create_if_not_exists=True)

    base_mount = ds.path(path=data_ref_path, data_reference_name=data_ref_name).as_mount()
    return base_mount
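
# A hedged sketch of consuming the mount returned by mount_datastores. It assumes the surrounding
# class exposes self.ws plus the CLI args used above; 'trainer', the compute target name and
# train.py are hypothetical. The mount is passed to the run through the (older) Estimator API.
from azureml.core import Experiment
from azureml.train.estimator import Estimator

data_mount = trainer.mount_datastores(datastore_name="training_ds",
                                      container_name="training-data",
                                      data_ref_path="raw",
                                      data_ref_name="training_data")
est = Estimator(source_directory=".",
                entry_script="train.py",
                compute_target="cpu-cluster",
                inputs=[data_mount])
run = Experiment(trainer.ws, "mount-example").submit(est)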
def createDataReference(workspace, storage_name, storage_key, storage_container_name,
                        data_store_name, data_reference_name):
    '''
    If not present, registers a new azureml.core.datastore.Datastore. Once the data store
    is in hand, it creates an instance of azureml.data.data_reference.DataReference that
    can be used in an Azure ML pipeline step.

    PARAMS:
      workspace              : azureml.core.Workspace : Existing AMLS Workspace
      storage_name           : string : Name of the Azure Storage Account
      storage_key            : string : Access key for the Azure Storage Account
      storage_container_name : string : Container name to receive blobs. Must exist
      data_store_name        : string : Name of the registered data store
      data_reference_name    : string : Name of the data reference

    RETURNS:
      tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)
    '''
    data_store = None
    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception as ex:
        print("Creating data store - ", data_store_name)
        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
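
# A hedged sketch showing the DataReference returned by createDataReference used as a pipeline step
# input. The workspace config, storage values, compute target name and prep.py script are hypothetical.
from azureml.core import Workspace
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

ws = Workspace.from_config()
data_store, input_ref = createDataReference(workspace=ws,
                                            storage_name="mystorageaccount",
                                            storage_key="<storage-account-key>",
                                            storage_container_name="rawdata",
                                            data_store_name="raw_data_store",
                                            data_reference_name="raw_data")
prep_step = PythonScriptStep(name="prepare",
                             script_name="prep.py",
                             arguments=["--input", input_ref],
                             inputs=[input_ref],
                             compute_target="cpu-cluster",
                             source_directory=".")
pipeline = Pipeline(workspace=ws, steps=[prep_step])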
def get_ds_object(ws, name):
    """
    get_ds_object - Get a workspace datastore object

    :param Workspace ws: workspace
    :param str name: data store name
    :returns: the registered datastore
    :rtype: Datastore
    """
    return Datastore.get(ws, name)
def create_and_attach_blob_storage(cfg, ws):
    """
    If required, creates the blob storage containers in the data references of cfg
    """
    if len(cfg.DataReference.localDirectoryBlobList) > 0:
        for ref in cfg.DataReference.localDirectoryBlobList:
            log.info("Attempting to create Blob Container '%s' on storage account '%s'.",
                     ref.remoteBlobContainer, ref.storageAccountName)
            blob_service = BlockBlobService(ref.storageAccountName, ref.storageAccountKey)
            created = blob_service.create_container(ref.remoteBlobContainer, fail_on_exist=False)
            if created:
                log.info("Blob Container '%s' on storage account '%s' created.",
                         ref.remoteBlobContainer, ref.storageAccountName)
            else:
                log.info("Blob Container '%s' on storage account '%s' already existed.",
                         ref.remoteBlobContainer, ref.storageAccountName)

            # Get the most recent list of datastores linked to the current workspace
            datastores = ws.datastores

            # Check whether the blob datastore is already registered
            ds = None if ref.dataref_id not in datastores else Datastore(workspace=ws, name=ref.dataref_id)

            # If the datastore exists but isn't mapped to the right place, re-register it
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteBlobContainer:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True

            if recreate:
                log.info('Registering blob "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_blob_container(
                    workspace=ws,
                    datastore_name=ref.dataref_id,
                    container_name=ref.remoteBlobContainer,
                    account_name=ref.storageAccountName,
                    account_key=ref.storageAccountKey,
                    # Overwrites the datastore object (not the data itself) if it is already part of this workspace
                    overwrite=True,
                )
            else:
                log.info('Blob "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
def get_or_create_dataset(azure_config: AzureConfig, azure_dataset_id: str) -> Dataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
    created and registered, assuming that the files are in a folder that has the same name as the dataset.
    For example, if azure_dataset_id is 'foo', then the 'foo' dataset is pointing to the
    <container_root>/datasets/foo folder.

    WARNING: the behaviour of Dataset.File.from_files, used below, is idiosyncratic. For example,
    if the "mydataset" datastore has two "foo..." subdirectories, each containing a file dataset.csv
    and a directory ABC:

        datastore = Datastore.get(workspace, "mydataset")
        # This dataset has the file(s) in foo-bar01 at top level, e.g. dataset.csv
        ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
        # This dataset has two directories at top level, each with a name matching foo-bar*, and each
        # containing dataset.csv.
        ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])
        # This dataset contains a single directory "mydataset" at top level, containing a subdirectory
        # foo-bar01, containing dataset.csv and (part of) ABC.
        ds3 = Dataset.File.from_files([(datastore, "foo-bar01/*"),
                                       (datastore, "foo-bar01/ABC/abc_files/*/*.nii.gz")])

    These behaviours can be verified by calling "ds.download()" on each dataset ds.
    """
    if not azure_config.azureml_datastore:
        raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
    logging.info(f"Retrieving datastore '{azure_config.azureml_datastore}' from AzureML workspace")
    workspace = azure_config.get_workspace()
    datastore = Datastore.get(workspace, azure_config.azureml_datastore)
    try:
        logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
        # See WARNING above before changing the from_files call!
        azureml_dataset = Dataset.File.from_files([(datastore, azure_dataset_id)])
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
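
# A hedged usage sketch for get_or_create_dataset above. It assumes an AzureConfig instance
# (azure_config) has already been built elsewhere with azureml_datastore set; the dataset name
# below is a placeholder.
dataset = get_or_create_dataset(azure_config, azure_dataset_id="my_dataset")
print(dataset)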
def create_dataset(ws):
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset', 'data.csv')
    data = pd.read_csv(
        './data.csv.zip',
        compression='zip',
        sep='|'
    )

    # Clean dataset
    data = clean_data(data)

    # Register Dataset in Workspace
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                                                              datastore,
                                                              name,
                                                              description=description_text)
    return dataset
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"
    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id})['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )
    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create',
                            json={
                                "instance_pool_name": pool_name,
                                "node_type_id": "Standard_D3_v2",
                                "idle_instance_autotermination_minutes": 10,
                                "preloaded_spark_versions": [DATABRICKS_RUNTIME_VERSION],
                            })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step
    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")
    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)
    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")
    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    # feature_eng_output as input.
    # ...
    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set the AMLPIPELINE_ID variable
    # for the AML Pipeline task in the next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
# this is the URL to the CSV file containing the connected car component descriptions
cardata_url = ('https://quickstartsws9073123377.blob.core.windows.net/'
               'azureml-blobstore-0d1c4218-a5f9-418b-bf55-902b65277b85/'
               'quickstarts/connected-car-data/connected-car_components.csv')

cardata_ds_name = 'connected_car_components'
cardata_ds_description = 'Connected car components data'

embedding_dim = 100
training_samples = 90000
validation_samples = 5000
max_words = 10000

run = Run.get_context()
ws = run.experiment.workspace
ds = Datastore.get_default(ws)

# -------------------------------------------------------------------
#
# Process GloVe embeddings dataset
#
# -------------------------------------------------------------------

# The GloVe embeddings dataset is static, so we will only register it once with the workspace
print("Downloading GloVe embeddings...")

try:
    glove_ds = Dataset.get_by_name(workspace=ws, name=glove_ds_name)
    print('GloVe embeddings dataset already registered.')
except:
from azureml.core import Workspace
from azureml.core.datastore import Datastore
from azureml.pipeline.core import Schedule

# "Playground" is the workspace name; load the Workspace object before looking up its blob datastore
workspace = Workspace.from_config()  # or Workspace.get(name="Playground", ...)
datastore = Datastore(workspace=workspace, name="workspaceblobstore")

schedule = Schedule.create(workspace,
                           name="TestSchedule",
                           pipeline_id="3100e87c-3300-400b-a5a5-470e85a100b3",
                           experiment_name="working version",
                           datastore=datastore,
                           polling_interval=25,
                           path_on_datastore="file/path")
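
# A hedged follow-up sketch: listing the workspace's schedules and disabling the one created above.
# Schedule.list and Schedule.disable are standard SDK calls; the loop below is purely illustrative.
for sched in Schedule.list(workspace):
    print(sched.id, sched.name, sched.pipeline_id)

schedule.disable(wait_for_provisioning=True)
print("Schedule disabled:", schedule.id)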
def __init__(self, ws, service_name, model_name):
    self.__ws = ws
    self.__service_name = service_name
    self.__model = Model(self.__ws, name=model_name)
    self.__datastore = Datastore.get_default(ws)
def InitAML(model_name, env, svcpw, interactive=False, create_ws=False):
    print("Environment is ", env)
    configFilePath = "./environment_setup/Config/config_" + env + ".ini"
    configFile = ConfigParser()
    configFile.read(configFilePath)

    svc_pr_pd = svcpw
    tenant_id = configFile.get('PARAMS', 'tenant_id')
    service_principal_id = configFile.get('PARAMS', 'service_principal_id')
    subscription_id = configFile.get('PARAMS', 'subscription_id')
    resource_group = configFile.get('PARAMS', 'resource_group')
    blobname = configFile.get('PARAMS', 'BlobName')
    workspace_name = configFile.get('PARAMS', 'WorkSpace')
    data_factory_name = configFile.get('PARAMS', 'Data_factory_name')
    location = configFile.get('PARAMS', 'location')

    fp = './' + model_name + '/aml_service/setup.ini'
    conf = ConfigParser()
    conf.read(fp)
    AML_COMPUTE_CLUSTER_NAME = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_NAME')
    AML_COMPUTE_CLUSTER_MIN_NODES = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_MIN_NODES')
    AML_COMPUTE_CLUSTER_MAX_NODES = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_MAX_NODES')
    AML_COMPUTE_CLUSTER_SKU = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_SKU')

    if interactive:
        auth = InteractiveLoginAuthentication(tenant_id=tenant_id)
    else:
        auth = ServicePrincipalAuthentication(
            tenant_id=tenant_id,
            service_principal_id=service_principal_id,
            service_principal_password=svc_pr_pd)

    try:
        ws = Workspace(subscription_id=subscription_id,
                       resource_group=resource_group,
                       workspace_name=workspace_name,
                       auth=auth)
        print('Library configuration succeeded')
    except Exception:
        if create_ws:
            ws = Workspace.create(name=workspace_name,
                                  auth=auth,
                                  subscription_id=subscription_id,
                                  resource_group=resource_group,
                                  create_resource_group=False,
                                  location=location)
            print('Workspace not found and was created')
        else:
            print('Workspace not found and not created')

    print('workspace_name:', ws.name,
          '\nworkspace_location:', ws.location,
          '\nworkspace_resource_group:', ws.resource_group, sep='\t')

    # choose a name for your cluster
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", AML_COMPUTE_CLUSTER_NAME)

    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and type(compute_target) is AmlCompute:
            print('Found existing compute target, reusing it: ' + compute_name)
    else:
        print('creating a new compute target...')
        compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", AML_COMPUTE_CLUSTER_MIN_NODES)
        compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", AML_COMPUTE_CLUSTER_MAX_NODES)

        # This example uses a CPU VM. To use a GPU VM, set the SKU to STANDARD_NC6
        vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", AML_COMPUTE_CLUSTER_SKU)

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            min_nodes=compute_min_nodes,
            max_nodes=compute_max_nodes)

        # create the cluster
        compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

        # For a more detailed view of current AmlCompute status, use get_status()
        print(compute_target.get_status().serialize())

    try:
        datastore = Datastore(ws, name=blobname)
        print("Found Blob Datastore with name: %s" % datastore.name)
    except Exception:
        print("No datastore with name: %s" % blobname)
        sys.exit(-1)

    try:
        data_factory = DataFactoryCompute(ws, data_factory_name)
        print('data_factory ', data_factory)
    except ComputeTargetException as e:
        if 'ComputeTargetNotFound' in e.message:
            print('Data factory Compute not found, creating...')
            provisioning_config = DataFactoryCompute.provisioning_configuration()
            data_factory = ComputeTarget.create(ws, data_factory_name, provisioning_config)
            data_factory.wait_for_completion()
        else:
            print('Data factory Compute not found, entering else section...')
            raise e

    return datastore, compute_target, ws, data_factory
production_model = next(
    filter(
        lambda x: x.created_time == max(model.created_time for model in model_list),
        model_list,
    )
)
model = Model(ws, name=production_model.name)

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config)
service.wait_for_deployment(show_output=True)

aci_webservice = {}
aci_webservice["aci_name"] = service.name
aci_webservice["aci_url"] = service.scoring_uri
with open(args.aci_store + "/aci_webservice.json", "w") as outfile:
    json.dump(aci_webservice, outfile)

# Get the blob storage associated with the workspace
pipeline_datastore = Datastore(ws, "datastore_pipeline")

# Upload the webservice details to the main blob folder
pipeline_datastore.upload_files([args.aci_store + "/aci_webservice.json"],
                                target_path="webservice" + '/' + args.repo_owner + '/' + args.repo_name,
                                overwrite=True)

print("Deployed ACI Webservice: {} \nWebservice Uri: {}".format(service.name, service.scoring_uri))
import argparse
from pathlib import Path

from azureml.core.datastore import Datastore
from azureml.core.workspace import Workspace

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--subscription-id", type=str)
    parser.add_argument("--resource-group", type=str)
    parser.add_argument("--workspace-name", type=str)
    parser.add_argument("--datastore-name", type=str)
    parser.add_argument("--data-directory", type=str)
    parser.add_argument("--dataset-name", type=str)
    args = parser.parse_args()

    print(args.workspace_name)
    workspace = Workspace(
        subscription_id=args.subscription_id,
        resource_group=args.resource_group,
        workspace_name=args.workspace_name,
    )
    datastore = Datastore.get(workspace, args.datastore_name)

    local_path = Path(args.data_directory)
    for phase in ["train", "val"]:
        local_directory = str(local_path / phase)
        target_path = str(Path(args.dataset_name) / phase)
        datastore.upload(local_directory,
                         target_path=target_path,
                         show_progress=True)
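
# Example invocation of the upload script above (a sketch; the script name and all values
# are placeholders):
#
#   python upload_to_datastore.py \
#       --subscription-id <subscription-id> \
#       --resource-group my-rg \
#       --workspace-name my-workspace \
#       --datastore-name workspaceblobstore \
#       --data-directory ./data \
#       --dataset-name image_classification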
cv_results['n_features'] = X.shape[1]
cv_results['y_0'] = y.tolist().count(0)
cv_results['y_1'] = y.tolist().count(1)

print(cv_results["mean_test_pr_auc"].to_string(index=False))
run.log(name="mean_test_pr_auc",
        value=cv_results["mean_test_pr_auc"].to_string(index=False))

if not os.path.isdir(args.train_model):
    os.makedirs(args.train_model, exist_ok=True)

timestamp_id = datetime.datetime.now()
time = timestamp_id.strftime("%m-%d-%Y_%H%M")
model_name = "{}__{}.json".format(args.repo_owner, args.repo_name)
output_path = os.path.join(args.train_model, model_name)
with open(output_path, 'w') as outfile:
    cv_results.to_json(outfile, orient='table', index=False)

# Get the blob storage associated with the workspace
pipeline_datastore = Datastore(ws, "datastore_pipeline")

# Upload the training results to the main blob folder
pipeline_datastore.upload_files([args.train_model + '/' + model_name],
                                target_path="train_model" + '/' + args.repo_owner + '/' + args.repo_name + '/' + time,
                                overwrite=True)
print("Model is trained!")
from azureml.core import Workspace

ws = Workspace.from_config()

from azureml.core.datastore import Datastore

batchscore_blob = Datastore.register_azure_blob_container(
    ws,
    datastore_name="images_datastore",
    container_name="sampledata",
    account_name="pipelinedata",
    overwrite=True)

def_data_store = ws.get_default_datastore()

from azureml.core.dataset import Dataset
from azureml.pipeline.core import PipelineData

input_images = Dataset.File.from_files((batchscore_blob, "batchscoring/images/"))
label_ds = Dataset.File.from_files((batchscore_blob, "batchscoring/labels/"))
output_dir = PipelineData(name="scores",
                          datastore=def_data_store,
                          output_path_on_compute="batchscoring/results")

input_images = input_images.register(workspace=ws, name="input_images")
label_ds = label_ds.register(workspace=ws, name="label_ds")

from azureml.core.model import Model

model = Model(ws, 'tf-dnn-mnist')
    ws = Workspace.create(subscription_id=azureSubscriptionID,
                          resource_group=azureResourceGroup,
                          name=azureMLWorkSpaceName,
                          location=azureMLWorkSpaceLocation)
else:
    ws = Workspace.get(azureMLWorkSpaceName, subscription_id=azureSubscriptionID)

# create or use an existing experiment
exp = Experiment(workspace=ws, name=experiment_name)

# register our existing Azure Blob Container with the labeled audio files
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=azureStorageTargetContainer,
    container_name=azureStorageTargetContainer,
    account_name=azureStorgeAccountName,
    account_key=azureStorageKeyName,
    create_if_not_exists=False)

# create a reference where we mount the DataStore to the container instance
dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_compute='data',
                                mode='mount')

# upload any needed files
ws.get_default_datastore().upload(src_dir='.',
                                  target_path='.',
                                  overwrite=True,
                                  show_progress=True)
if 'creditcard' not in ws.datasets:
    # Set up the blob datastore
    blob_datastore_name = 'MyBlobDatastore'
    account_name = os.getenv("BLOB_ACCOUNTNAME_62",
                             "PUT YOUR STORAGE ACCOUNT NAME HERE")      # Storage account name
    container_name = os.getenv("BLOB_CONTAINER_62",
                               "PUT YOUR STORAGE CONTAINER NAME HERE")  # Name of Azure blob container
    account_key = os.getenv("BLOB_ACCOUNT_KEY_62",
                            "PUT YOUR STORAGE ACCOUNT KEY HERE")        # Storage account key

    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except Exception:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)        # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)

    blob_data_ref = DataReference(datastore=blob_datastore,
                                  data_reference_name="blob_test_data",
                                  path_on_datastore="testdata")
    csv_path = (blob_datastore, '/creditcard.csv')
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('No compute cluster named {}'.format(cluster_name))
    exit()

curated_env_name = 'Resnet50v15-CPU-cluster'
# pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env = Environment.from_conda_specification(
    name=curated_env_name, file_path='./conda_dependencies.yml')

project_folder = './'
data_path = 'datasets'
datastore = Datastore.get(ws, 'workspaceblobstore')
dataset = Dataset.File.from_files(path=(datastore, data_path))
data_loc = dataset.as_named_input('input').as_mount()

src = ScriptRunConfig(
    source_directory=project_folder,
    # command=['ls'],
    script='train_resnet.py',
    arguments=[
        '--num_epochs', 16,
        '--batch', '32',
        '--shuffle', 'True',
        '--dataloc',
workspace_name="<your workspace name>" if subscription_id.startswith("<"): raise ex else: # write and reload from config file config = {"Scope": "/subscriptions/" + subscription_id + "/resourceGroups/" + resource_group + "/providers/Microsoft.MachineLearningServices/workspaces/" + workspace_name +"/projects/samples"} import json import os os.makedirs(os.path.dirname(config_path), exist_ok=True) with open(config_path, "w") as fo: fo.write(json.dumps(config)) ws = Workspace.from_config(path=config_path) from azureml.core.datastore import Datastore relevance_datastore = Datastore(ws, 'adls_relevance09') # Retrieve or create the computer target from azureml.core.compute import AmlCompute, ComputeTarget from azureml.core.compute_target import ComputeTargetException cluster_name = "Cmpt-112GB-16Cr" # cluster_name = "Cmpt-512GB-64Cr" if cluster_name not in ws.compute_targets: print('Creating a new compute target...') compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_v2", max_nodes=4) compute_target = ComputeTarget.create(ws, cluster_name, compute_config) compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
    ws = Workspace.get(name=workspace_name,
                       subscription_id=subscription_id,
                       resource_group=resource_group,
                       auth=service_principal)
    return ws


if __name__ == '__main__':
    global run
    run = Run.get_context()
    auth_params = get_args()
    ws = get_ws(auth_params)
    datastore_names = list(ws.datastores.keys())
    def_data_store = ws.get_default_datastore()
    def_blob_store = Datastore(ws, "workspaceblobstore")
    data_temp_folder = os.path.join(cwd, "data_temp")
    create_folders([data_temp_folder])

    dataset = {
        'dataset': "https://github.com/rouzbeh-afrasiabi/PublicDatasets/raw/master/train.csv.zip"
    }
    word_vectors = {
        "en_vectors_web_lg": "https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz"
    }
    toDownload = [dataset, word_vectors]
    download_files(toDownload, data_temp_folder)
def create_DDoS_datasets(ws):
    dtypes = {
        'Src IP': 'category', 'Src Port': 'uint16', 'Dst IP': 'category', 'Dst Port': 'uint16',
        'Protocol': 'category', 'Flow Duration': 'uint32', 'Tot Fwd Pkts': 'uint32',
        'Tot Bwd Pkts': 'uint32', 'TotLen Fwd Pkts': 'float32', 'TotLen Bwd Pkts': 'float32',
        'Fwd Pkt Len Max': 'float32', 'Fwd Pkt Len Min': 'float32', 'Fwd Pkt Len Mean': 'float32',
        'Fwd Pkt Len Std': 'float32', 'Bwd Pkt Len Max': 'float32', 'Bwd Pkt Len Min': 'float32',
        'Bwd Pkt Len Mean': 'float32', 'Bwd Pkt Len Std': 'float32', 'Flow Byts/s': 'float32',
        'Flow Pkts/s': 'float32', 'Flow IAT Mean': 'float32', 'Flow IAT Std': 'float32',
        'Flow IAT Max': 'float32', 'Flow IAT Min': 'float32', 'Fwd IAT Tot': 'float32',
        'Fwd IAT Mean': 'float32', 'Fwd IAT Std': 'float32', 'Fwd IAT Max': 'float32',
        'Fwd IAT Min': 'float32', 'Bwd IAT Tot': 'float32', 'Bwd IAT Mean': 'float32',
        'Bwd IAT Std': 'float32', 'Bwd IAT Max': 'float32', 'Bwd IAT Min': 'float32',
        'Fwd PSH Flags': 'category', 'Bwd PSH Flags': 'category', 'Fwd URG Flags': 'category',
        'Bwd URG Flags': 'category', 'Fwd Header Len': 'uint32', 'Bwd Header Len': 'uint32',
        'Fwd Pkts/s': 'float32', 'Bwd Pkts/s': 'float32', 'Pkt Len Min': 'float32',
        'Pkt Len Max': 'float32', 'Pkt Len Mean': 'float32', 'Pkt Len Std': 'float32',
        'Pkt Len Var': 'float32', 'FIN Flag Cnt': 'category', 'SYN Flag Cnt': 'category',
        'RST Flag Cnt': 'category', 'PSH Flag Cnt': 'category', 'ACK Flag Cnt': 'category',
        'URG Flag Cnt': 'category', 'CWE Flag Count': 'category', 'ECE Flag Cnt': 'category',
        'Down/Up Ratio': 'float32', 'Pkt Size Avg': 'float32', 'Fwd Seg Size Avg': 'float32',
        'Bwd Seg Size Avg': 'float32', 'Fwd Byts/b Avg': 'uint32', 'Fwd Pkts/b Avg': 'uint32',
        'Fwd Blk Rate Avg': 'uint32', 'Bwd Byts/b Avg': 'uint32', 'Bwd Pkts/b Avg': 'uint32',
        'Bwd Blk Rate Avg': 'uint32', 'Subflow Fwd Pkts': 'uint32', 'Subflow Fwd Byts': 'uint32',
        'Subflow Bwd Pkts': 'uint32', 'Subflow Bwd Byts': 'uint32', 'Init Fwd Win Byts': 'uint32',
        'Init Bwd Win Byts': 'uint32', 'Fwd Act Data Pkts': 'uint32', 'Fwd Seg Size Min': 'uint32',
        'Active Mean': 'float32', 'Active Std': 'float32', 'Active Max': 'float32',
        'Active Min': 'float32', 'Idle Mean': 'float32', 'Idle Std': 'float32',
        'Idle Max': 'float32', 'Idle Min': 'float32', 'Label': 'category'
    }

    data = pd.read_csv(
        './final_dataset.csv',
        parse_dates=['Timestamp'],
        usecols=[*dtypes.keys(), 'Timestamp'],
        engine='c',
        low_memory=True,
        na_values=np.inf
    )

    # The original dataset has over 12 million rows. For this project that much data takes far too long
    # to process, so randomly sample only 0.5% of it.
    data = data.sample(frac=0.005)

    # Register the base dataset in the workspace
    datastore = Datastore(ws)
    name = "DDoS Dataset"
    description_text = "DDoS DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                                                              datastore,
                                                              name,
                                                              description=description_text)

    # Clean the dataset and register the clean version
    cleaned_data = clean_data(data)
    clean_dataset_name = "Clean DDoS Dataset"
    clean_description_text = description_text + " that has been cleaned"
    clean_dataset = TabularDatasetFactory.register_pandas_dataframe(cleaned_data,
                                                                    datastore,
                                                                    clean_dataset_name,
                                                                    description=clean_description_text)