def ComputeCompute():
    """Look up an AML compute cluster, creating it when it cannot be loaded.

    All inputs come from the JSON body of the module-level ``request`` object
    (presumably a Flask request -- confirm against the file's imports). The
    keys read are ``subscription_id``, ``resource_group``, ``workspace_name``,
    ``location``, ``cluster_name``, ``vm_size``, ``min_nodes`` and
    ``max_nodes``.

    Returns:
        str: ``"Found existing AML compute context."`` when a cluster named
        ``cluster_name`` could be loaded from the workspace, otherwise
        ``"Compute successfully created"`` after provisioning has completed.
    """
    payload = request.json
    subscription_id = payload['subscription_id']
    resource_group = payload['resource_group']
    workspace_name = payload['workspace_name']
    # 'location' is not used below; it is still read so that a request
    # without it keeps failing exactly as before.
    location = payload['location']
    cluster_name = payload['cluster_name']
    vm_size = payload['vm_size']
    min_nodes = payload['min_nodes']
    max_nodes = payload['max_nodes']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    try:
        AmlCompute(ws, cluster_name)
    except Exception:
        # Any failure to load the cluster is treated as "does not exist".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate instead of triggering a provisioning run.
        print('need to create new Compute.')
    else:
        print('Found existing AML compute context.')
        return "Found existing AML compute context."

    print('Creating new AML compute context.')
    aml_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                       min_nodes=min_nodes,
                                                       max_nodes=max_nodes)
    aml_compute = AmlCompute.create(ws,
                                    name=cluster_name,
                                    provisioning_configuration=aml_config)
    # Block until provisioning has finished so the caller gets a final answer.
    aml_compute.wait_for_completion(show_output=True)
    return "Compute successfully created"
def get_aml_compute(workspace):
    """Return the AmlCompute target for ``workspace``, provisioning it if absent.

    The target name and the VM configuration are hard-coded template values
    (see the TODO markers). When loading the target raises
    ``ComputeTargetException`` a new one is created and the call blocks until
    provisioning completes (or the 20 minute timeout passed to
    ``wait_for_completion`` applies).
    """
    # TODO: Set desired name for compute target
    target_name = "example_vm_name"

    try:
        cluster = AmlCompute(workspace, target_name)
    except ComputeTargetException:
        print("creating new compute target")
        # TODO: Configure desired VM, see: https://docs.microsoft.com/nl-nl/azure/virtual-machines/sizes-general
        vm_settings = dict(
            vm_size="STANDARD_D1_V2",
            min_nodes=0,
            max_nodes=1,
            vnet_resourcegroup_name="",
            vnet_name="-vn",
            subnet_name="default",
            idle_seconds_before_scaledown=1800,
            vm_priority='lowpriority',
        )
        cluster = ComputeTarget.create(
            workspace,
            target_name,
            AmlCompute.provisioning_configuration(**vm_settings),
        )
        cluster.wait_for_completion(show_output=True,
                                    min_node_count=None,
                                    timeout_in_minutes=20)
    else:
        print("found existing compute target.")

    print("Azure Machine Learning Compute attached")
    return cluster
def choose_compute_target(workspace, name):
    """Return the compute target called ``name``, creating it when needed.

    Args:
        workspace: Workspace object handed to ``AmlCompute`` /
            ``ComputeTarget.create``.
        name: Name of the compute target to load or create.

    Returns:
        The loaded or newly provisioned compute target. A new target is
        created with VM size ``STANDARD_D2_V2`` and 1-4 nodes, and the call
        waits for provisioning to complete.
    """
    try:
        aml_compute = AmlCompute(workspace, name)
        print("Found existing compute target: {}".format(name))
    except Exception:
        # Any load failure is treated as "target missing". ``Exception`` (not
        # a bare ``except:``) keeps Ctrl-C / SystemExit from starting a
        # provisioning run.
        print("Creating new compute target: {}".format(name))
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size="STANDARD_D2_V2", min_nodes=1, max_nodes=4)
        aml_compute = ComputeTarget.create(workspace, name, provisioning_config)
        aml_compute.wait_for_completion(show_output=True)

    print(aml_compute)
    return aml_compute
def setup(num):
    """Prepare workspace number ``num``: workspace, compute target and dataset.

    Relies on module-level settings (``workspace_prefix``, ``location``,
    ``subscription_id``, ``resource_group``, ``compute_name``, ``vm_size``,
    ``min_nodes``, ``max_nodes``). The steps are:

    1. Get the workspace ``<prefix>-<location>-<NN>`` or create it.
    2. Load the compute target and update its node limits, or create it;
       then wait for the operation to complete.
    3. Upload ``testdata`` to the default datastore and register the
       ``sample_dataset`` file dataset unless it is already registered.
    """
    workspace_name = '%s-%s-%02d' % (workspace_prefix, location, num)
    identity = dict(name=workspace_name,
                    subscription_id=subscription_id,
                    resource_group=resource_group)

    # --- workspace -------------------------------------------------------
    try:
        ws = Workspace.get(**identity)
    except WorkspaceException:
        print('Creating new workspace %s...' % workspace_name)
        ws = Workspace.create(location=location, **identity)
    else:
        print('Found existing workspace %s' % workspace_name)

    # --- compute ---------------------------------------------------------
    node_limits = dict(min_nodes=min_nodes, max_nodes=max_nodes)
    try:
        compute_target = AmlCompute(ws, compute_name)
        print('Found existing compute %s' % compute_name)
        # update() stays inside the try: a ComputeTargetException raised by
        # it also falls through to the creation path.
        compute_target.update(**node_limits)
    except ComputeTargetException:
        print('Creating new compute target %s...' % compute_name)
        compute_target = ComputeTarget.create(
            ws,
            compute_name,
            AmlCompute.provisioning_configuration(vm_size=vm_size, **node_limits),
        )
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)

    # --- dataset ---------------------------------------------------------
    ds = ws.get_default_datastore()
    ds.upload("testdata")

    dataset_name = 'sample_dataset'
    if dataset_name in ws.datasets:
        print('Dataset already exists')
        return

    sample_files = Dataset.File.from_files(path=[(ds, 'testdata.txt')])
    sample_files.register(workspace=ws,
                          name=dataset_name,
                          description='Sample data for load test')
    print('Dataset successfully registered')
# Script: make sure the AmlCompute cluster 'tdsp-cluster' exists in the
# 'akws' workspace, then wait for it to finish provisioning.
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# NOTE(review): workspace name, subscription id and resource group are
# hard-coded here; consider moving them to configuration.
ws = Workspace.get(name='akws',
                   subscription_id='8b3748c0-bb0b-4913-ab5b-c462062118fe',
                   resource_group='akrg')
cpu_cluster_name = 'tdsp-cluster'

# Try to load the cluster; ComputeTargetException is taken to mean that it
# does not exist yet, in which case it is created.
try:
    cpu_cluster = AmlCompute(workspace=ws, name=cpu_cluster_name)
    print('Cluster already exists.')
except ComputeTargetException:
    # NOTE(review): the variables are named cpu_* but the requested VM size
    # is 'Standard_NC6' -- confirm the intended SKU.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC6', max_nodes=4)
    cpu_cluster = AmlCompute.create(ws, cpu_cluster_name, compute_config)

# Block until the cluster operation has completed, streaming progress output.
cpu_cluster.wait_for_completion(show_output=True)
)
# (The closing parenthesis above ends a statement that starts before this
# chunk; presumably the add_argument call that defines args.aml_compute_target.)
parser.add_argument("--path", type=str, help="path", dest="path", required=True)
args = parser.parse_args()
# Echo the parsed command-line arguments.
print("Argument 1: %s" % args.aml_compute_target)
print("Argument 2: %s" % args.path)

# Authenticate with AzureCliAuthentication and load the workspace from the
# config found under args.path.
print("creating AzureCliAuthentication...")
cli_auth = AzureCliAuthentication()
print("done creating AzureCliAuthentication!")
print("get workspace...")
ws = Workspace.from_config(path=args.path, auth=cli_auth)
print("done getting workspace!")

# Load the compute target named on the command line; create it when loading
# raises ComputeTargetException.
try:
    aml_compute = AmlCompute(ws, args.aml_compute_target)
    print("found existing compute target.")
except ComputeTargetException:
    print("creating new compute target")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_DS4_V2", min_nodes=1, max_nodes=8
    )
    aml_compute = ComputeTarget.create(ws, args.aml_compute_target, provisioning_config)
    # Wait for provisioning, with a 20 minute timeout.
    aml_compute.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20
    )
print("Aml Compute attached")
# Load (or create) the CPU compute target whose name is stored in
# config['cpu_compute'].
cpu_compute_name = config['cpu_compute']
try:
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)
# NOTE(review): bare except -- any error, including KeyboardInterrupt,
# triggers creation of a new target. The trailing comment suggests
# ComputeTargetException was the intended type.
except:  # ComputeTargetException:
    print("creating new compute target")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        idle_seconds_before_scaledown=1800)
    cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
    # Wait for provisioning, with a 20 minute timeout.
    cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster.
print(cpu_compute_target.get_status().serialize())

# choose a name for your cluster
# Same load-or-create pattern for the GPU target named in config['gpu_compute'].
gpu_compute_name = config['gpu_compute']
try:
    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print("found existing compute target: %s" % gpu_compute_name)
# NOTE(review): bare except again -- see the note on the CPU target.
except:
    print('Creating a new compute target...')
    # The call below continues past the end of this chunk.
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
# choose a name for your cluster cluster_name = "gpucluster" try: compute_target = AmlCompute(workspace=ws, name=cluster_name) print('Found existing compute target.') except ComputeTargetException: print('Creating a new compute target...') compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', max_nodes=4, idle_seconds_before_scaledown=1800) # create the cluster compute_target = ComputeTarget.create(ws, cluster_name, compute_config) compute_target.wait_for_completion(show_output=True) # use get_status() to get a detailed status for the current cluster. print(compute_target.get_status().serialize()) # Create a project directory # Create a directory that will contain all the necessary code from your # local machine that you will need access to on the remote resource. This # includes the training script and any additional files your training script # depends on. import os project_folder = './pytorch-hymenoptera'
def build_pipeline(dataset, ws, config):
    """Build, validate, publish and schedule the prednet pipeline for ``dataset``.

    The pipeline has four steps that run in sequence: video decoding, data
    preparation, HyperDrive training and model registration. After
    publishing, a datastore-triggered schedule is created that watches
    ``prednet/data/video/<dataset>/Train`` on the default datastore.

    Args:
        dataset: Dataset name; used in datastore paths, passed to the training
            script as ``--dataset`` and used to name the pipeline.
        ws: Workspace the compute targets, pipeline and schedule live in.
        config: Mapping with the keys ``cpu_compute``, ``gpu_compute``,
            ``acr_address``, ``acr_username`` and ``acr_password``.

    Returns:
        str: The published pipeline's name, ``'prednet_' + dataset``.
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    script_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_create.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for script_file in script_files:
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    # --- CPU compute target ------------------------------------------------
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:
        # Any load failure means "create it"; ``Exception`` rather than a bare
        # ``except:`` so that Ctrl-C / SystemExit are not swallowed.
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # --- GPU compute target ------------------------------------------------
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=10,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    # Reporting the status is best effort, so a failure is printed and the
    # build continues. Only ``Exception`` is caught, so interrupts propagate.
    try:
        print(gpu_compute_target.get_status().serialize())
    except Exception as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=[
            "azure-storage-blob==1.5.0",
            "hickle==3.4.3",
            "requests==2.21.0",
            "sklearn",
            "pandas==0.24.2",
            "azureml-sdk",
            "numpy==1.16.2",
            "pillow==6.0.0",
        ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py',
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image="wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  # , "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est,
                           hyperparameter_sampling=ps,
                           policy=policy,
                           primary_metric_name='val_loss',
                           primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                           max_total_runs=10,
                           max_concurrent_runs=5,
                           max_duration_minutes=60*6
                           )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws,
                        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Datastore-triggered schedule; the returned Schedule object is not
    # needed afterwards, so it is not bound to a name.
    Schedule.create(workspace=ws,
                    name=pipeline_name + "_sch",
                    pipeline_id=published_pipeline.id,
                    experiment_name=pipeline_name,
                    datastore=def_blob_store,
                    wait_for_provisioning=True,
                    # A space was missing between "Pipeline" and the name.
                    description="Datastore scheduler for Pipeline " + pipeline_name,
                    path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                    polling_interval=1
                    )

    return pipeline_name
# Notebook-style cell output: transposed view of outputDf.
outputDf.T

from azureml.core.compute import AmlCompute

# Load the AmlCompute cluster 'cpu-cluster' from workspace `ws`, or
# provision it (1-4 Standard_D2_v2 nodes) when it cannot be loaded.
aml_name = 'cpu-cluster'
try:
    aml_compute = AmlCompute(ws, aml_name)
    print('Found existing AML compute context.')
except Exception:
    # Any load failure is treated as "cluster missing". ``Exception`` (not a
    # bare ``except:``) keeps KeyboardInterrupt / SystemExit from starting a
    # provisioning run.
    print('Creating new AML compute context.')
    aml_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_v2", min_nodes=1, max_nodes=4)
    aml_compute = AmlCompute.create(ws,
                                    name=aml_name,
                                    provisioning_configuration=aml_config)
    aml_compute.wait_for_completion(show_output=True)

#writefile get_data.py
from sklearn import datasets
from sklearn.model_selection import train_test_split
from scipy import sparse
import numpy as np

#def get_data():
# NOTE(review): `pd` is not imported in this chunk (presumably
# `import pandas as pd` appears earlier in the file), and the CSV path is an
# absolute path on one specific Windows machine.
boston = pd.read_csv(
    'C:\\Users\\datacore\\OneDrive\\Desktop\\Capstone Project\\train_values_wJZrCmI.csv'
)
# Features are every column except the target; the target is converted to
# a NumPy array.
X = boston.drop(columns=['poverty_probability'])
y = boston['poverty_probability']
y = y.to_numpy()
# Load the AmlCompute cluster named by compute_target_name, creating it when
# loading raises ComputeTargetException.
try:
    batch_ai_compute = AmlCompute(workspace=ws, name=compute_target_name)
    print('found existing Azure Batch AI cluster:', batch_ai_compute.name)
except ComputeTargetException:
    print('creating new Azure Batch AI cluster...')
    # 0-4 dedicated Standard_NC6 nodes, scaled down after 300 idle seconds.
    batch_ai_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_NC6",
        vm_priority="dedicated",
        min_nodes=0,
        max_nodes=4,
        idle_seconds_before_scaledown=300)
    batch_ai_compute = AmlCompute.create(
        ws,
        name=compute_target_name,
        provisioning_configuration=batch_ai_config)
    # Block until provisioning has completed, streaming progress output.
    batch_ai_compute.wait_for_completion(show_output=True)

# Stage the data-loading script in a local project folder.
project_folder = './tmp/automl-remote-batchai'
if not os.path.exists(project_folder):
    os.makedirs(project_folder)
shutil.copy('./scripts/get_data.py', project_folder)

print("Training the model...")
# configure Auto ML
# (The AutoMLConfig call below continues past the end of this chunk.)
automl_config = AutoMLConfig(task='classification',
                             debug_log='automl_errors.log',
                             primary_metric='AUC_weighted',
                             iteration_timeout_minutes=2,
                             iterations=20,
                             n_cross_validations=5,
class ClusterConnector:
    """Thin wrapper class around azureml.core.compute.AmlCompute.

    Provides parallel ssh objects and helpers for master node and all node
    commands and file copies.

    Usage:
    >>> cc = ClusterConnector(workspace, "MyCluster", sshkey, "Standard_ND40rs_v2")
    >>> cc.initialise(min_nodes=0, max_nodes=4, idle_timeout_secs=30)
    >>> cluster = cc.cluster
    >>> [print(node['name']) for node in cc.cluster.list_nodes()]
    """

    def __init__(
        self,
        workspace,
        cluster_name,
        ssh_key,
        vm_type,
        admin_username="******",
    ):
        """Store the connection settings and route pssh host logs to a file.

        Args:
            workspace: Workspace the cluster belongs to; its ``name`` is used
                in the log file name.
            cluster_name: Name of the AmlCompute cluster to create or update.
            ssh_key: Public key passed as ``admin_user_ssh_key`` on creation.
            vm_type: VM size passed as ``vm_size`` on creation.
            admin_username: Admin/ssh user name for the cluster nodes.
        """
        self.cluster_name = cluster_name
        self.workspace = workspace
        self.ssh_key = ssh_key
        self.vm_type = vm_type
        self.admin_username = admin_username

        enable_host_logger()
        hlog = logging.getLogger("pssh.host_logger")
        tstr = datetime.now().isoformat(timespec="minutes")
        # Iterate over a copy: removeHandler() mutates hlog.handlers, and
        # removing while iterating the live list skips handlers.
        for handler in list(hlog.handlers):
            if isinstance(handler, logging.StreamHandler):
                hlog.removeHandler(handler)
        os.makedirs("clusterlogs", exist_ok=True)
        self.logfile = "clusterlogs/{}_{}.log".format(self.workspace.name, tstr)
        hlog.addHandler(logging.FileHandler(self.logfile))

        # Populated by initialise() / _create_cluster_ssh_conns().
        self.cluster = None
        self._master_scp = None
        self._master_ssh = None
        self._all_ssh = None

    def initialise(self, min_nodes=0, max_nodes=0, idle_timeout_secs=1800):
        """Initialise underlying AmlCompute cluster instance"""
        self._create_or_update_cluster(min_nodes, max_nodes, idle_timeout_secs)

    def _check_logs_emessage(self, host, port):
        """Return the error text pointing at the log file for ``host:port``."""
        msg = "Remote command failed on {}:{}. For details see {}".format(
            host, port, self.logfile)
        return msg

    def terminate(self):
        """Scale the cluster down to zero nodes.

        Raises:
            RuntimeError: if the update fails or nodes are still listed after
                the update has completed.
        """
        print('Attempting to terminate cluster "{}"'.format(
            colored(self.cluster_name, "green")))
        try:
            self.cluster.update(min_nodes=0,
                                max_nodes=0,
                                idle_seconds_before_scaledown=10)
            self.cluster.wait_for_completion()
        except ComputeTargetException as err:
            raise RuntimeError(
                "Failed to terminate cluster nodes ({})".format(err)) from err

        if self.cluster.list_nodes():
            raise RuntimeError(
                "Failed to terminate cluster nodes (nodes still running)")

    @property
    def cluster_nodes(self):
        """Current node list (after a state refresh), sorted by ssh port."""
        self.cluster.refresh_state()
        return sorted(self.cluster.list_nodes(), key=lambda n: n["port"])

    def _create_or_update_cluster(self, min_nodes, max_nodes, idle_timeout_secs):
        """Update the named cluster if it exists, otherwise create it.

        Raises:
            RuntimeError: if fewer than ``min_nodes`` nodes are present after
                the operation completed and one additional 30 s grace period.
        """
        try:
            self.cluster = AmlCompute(workspace=self.workspace,
                                      name=self.cluster_name)
            print('Updating existing cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            self.cluster.update(
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
            )
        except ComputeTargetException:
            print('Creating new cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            cluster_config = AmlCompute.provisioning_configuration(
                vm_size=self.vm_type,
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
                admin_username=self.admin_username,
                admin_user_ssh_key=self.ssh_key,
                remote_login_port_public_access="Enabled",
            )
            self.cluster = AmlCompute.create(self.workspace,
                                             self.cluster_name,
                                             cluster_config)

        self.cluster.wait_for_completion()

        if len(self.cluster_nodes) < min_nodes:
            # Give late nodes one more chance to show up before giving up.
            sleep(30)
            if len(self.cluster_nodes) < min_nodes:
                raise RuntimeError("Failed to provision sufficient nodes")

    def _copy_nodefile_to_nodes(self):
        """Collect each node's ib0 address and copy a nodefile to every node.

        The master node's address is written first. Single-node clusters are
        skipped.

        Raises:
            RuntimeError: if the address lookup fails on any node or returns
                no output.
        """
        if len(self.cluster_nodes) == 1:
            cprint("Single node cluster -- skipping IB config", "yellow")
            return

        print("Collecting cluster IB info")

        outputs = self._all_ssh.run_command(
            r'ifconfig ib0 | grep -oe "inet[^6][adr: ]*[0-9.]*" | cut -d" " -f2',
            shell="bash -c",
        )
        self._all_ssh.join(outputs)

        ibaddrs = []
        for output in outputs:
            host = output.host
            port = output.client.port
            if output.exit_code != 0:
                print(list(output.stdout))
                print(list(output.stderr))
                raise RuntimeError("Failed to get IB ip for {}:{}".format(
                    host, port))
            try:
                ibaddr = list(output.stdout)[0].split()[0]
            except IndexError as err:
                raise RuntimeError("Failed to get IB ip for {}:{} - "
                                   "No ib interface found!".format(host, port)) from err
            print("Mapping {}:{} -> {}".format(host, port, ibaddr))
            if port == self._master_scp.port:
                cprint("IB Master: {}".format(ibaddr), "green")
                ibaddrs = [ibaddr] + ibaddrs
            else:
                ibaddrs.append(ibaddr)

        # delete=False keeps the file on disk after the ``with`` block. The
        # copy happens after the block so the content is flushed and closed
        # before it is read back.
        with NamedTemporaryFile(delete=False, mode="wt") as nfh:
            self.nodefile = nfh.name
            for addr in ibaddrs:
                nfh.write("{}\n".format(addr))

        self.ibaddrs = ibaddrs
        self.copy_to_all_nodes(self.nodefile, "./nodefile")

    def _create_cluster_ssh_conns(self):
        """Create the all-node, master-node and master scp clients.

        The first node in port order is treated as the master.
        """
        nodes = self.cluster_nodes
        hostips = [n["publicIpAddress"] for n in nodes]
        hostconfigs = [HostConfig(port=n["port"]) for n in nodes]

        self._all_ssh = ParallelSSHClient(hostips,
                                          host_config=hostconfigs,
                                          user=self.admin_username)

        self._master_ssh = ParallelSSHClient(hostips[:1],
                                             host_config=hostconfigs[:1],
                                             user=self.admin_username)

        self._master_scp = SSHClient(hostips[0],
                                     port=hostconfigs[0].port,
                                     user=self.admin_username)

    def copy_to_all_nodes(self, source, dest):
        """Copy local ``source`` to ``dest`` on every node; raise on failure."""
        copy_jobs = self._all_ssh.copy_file(source, dest)
        joinall(copy_jobs, raise_error=True)

    def copy_to_master_node(self, source, dest):
        """Copy local ``source`` to ``dest`` on the master node."""
        self._master_scp.copy_file(source, dest)

    def copy_from_master_node(self, source, dest):
        """Copy remote ``source`` on the master node to local ``dest``."""
        self._master_scp.copy_remote_file(source, dest)

    def _raise_on_failure(self, outputs):
        """Raise RuntimeError for the first output with a non-zero exit code."""
        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def run_on_all_nodes(self, command):
        """Run ``command`` via ``bash -c`` on every node.

        Raises:
            RuntimeError: if the command exits non-zero on any node.
        """
        outputs = self._all_ssh.run_command(command, shell="bash -c")
        self._all_ssh.join(outputs, consume_output=True)
        self._raise_on_failure(outputs)

    def run_on_master_node(self, command):
        """Run ``command`` via ``bash -c`` on the master node only.

        Raises:
            RuntimeError: if the command exits non-zero.
        """
        outputs = self._master_ssh.run_command(command, shell="bash -c")
        self._master_ssh.join(outputs)
        self._raise_on_failure(outputs)

    def attempt_termination(self):
        """Try to terminate the cluster; print an error and a warning on failure."""
        try:
            self.terminate()
        except RuntimeError as err:
            print(colored("ERROR: {}\n\n", "red", attrs=["bold"]).format(err))
            self.warn_unterminated()

    def warn_unterminated(self):
        """Print a warning that the cluster is still running and costing money."""
        print(
            colored("WARNING: {}", "red", attrs=["bold"]).format(
                colored(
                    "Cluster {} is still running - terminate manually to avoid "
                    "additional compute costs".format(
                        colored(self.cluster_name, "green")),
                    "red",
                )))
def build_pipeline(dataset, ws, config):
    """Build, validate and publish the prednet training pipeline for ``dataset``.

    The pipeline has four steps that run in sequence: video decoding, data
    preparation, HyperDrive training (TensorFlow estimator) and model
    registration.

    Args:
        dataset: Dataset name; used in the datastore path of the video data
            and to name the pipeline.
        ws: Workspace the compute targets and the pipeline live in.
        config: Mapping with the keys ``cpu_compute`` and ``gpu_compute``
            (names of the compute targets to load or create).

    Returns:
        str: The published pipeline's name, ``'prednet_' + dataset``.
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    # The scripts are always taken from the current directory. An earlier
    # hostname check chose '.' in both of its branches, so it was dropped.
    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    script_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_build.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for script_file in script_files:
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    # --- CPU compute target ------------------------------------------------
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # --- GPU compute target ------------------------------------------------
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"],
                                      pip_packages=[
                                          "azure-storage-blob==1.5.0",
                                          "hickle==3.4.3", "requests==2.21.0",
                                          "sklearn", "pandas==0.24.2",
                                          "azureml-sdk==1.0.21",
                                          "numpy==1.16.2", "pillow==6.0.0"
                                      ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0",
        "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    # NOTE(review): gpu_compute_run_config is configured but not passed to
    # any step below -- confirm whether a step was meant to use it.
    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where the video data is stored.
    video_data = DataReference(datastore=def_blob_store,
                               data_reference_name="video_data",
                               path_on_datastore=os.path.join(
                                   "prednet", "data", "video", dataset))

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(name='prepare_data',
                                 script_name="data_preparation.py",
                                 arguments=[
                                     "--input_data", raw_data, "--output_data",
                                     preprocessed_data
                                 ],
                                 inputs=[raw_data],
                                 outputs=[preprocessed_data],
                                 compute_target=cpu_compute_target,
                                 source_directory=script_folder,
                                 runconfig=cpu_compute_run_config,
                                 allow_reuse=True,
                                 hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(source_directory=script_folder,
                     compute_target=gpu_compute_target,
                     pip_packages=[
                         'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                         'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                         'hickle'
                     ],
                     entry_script='train.py',
                     use_gpu=True,
                     node_count=1)

    ps = RandomParameterSampling({
        '--batch_size':
        choice(2, 4, 8, 16),
        '--filter_sizes':
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes':
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  # , "48, 96"),
        '--learning_rate':
        loguniform(-6, -1),
        '--lr_decay':
        loguniform(-9, -1),
        '--freeze_layers':
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2",
               "3"),
        '--transfer_learning':
        choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,  # 100,
        max_concurrent_runs=5,  # 10,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(name="train_w_hyperdrive",
                             hyperdrive_run_config=hdc,
                             estimator_entry_script_arguments=[
                                 '--data-folder', preprocessed_data,
                                 '--remote_execution'
                             ],
                             inputs=[preprocessed_data],
                             metrics_output=data_metrics,
                             allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
# Load the AmlCompute cluster 'dsvmaml' from workspace `ws`, creating it when
# it cannot be loaded, then assemble the AutoML settings.
# `nodes` is used both as the cluster's max_nodes and as
# max_concurrent_iterations below.
nodes = 4
dsvm_name = 'dsvmaml'
try:
    dsvm_compute = AmlCompute(ws, dsvm_name)
    print('found existing dsvm.')
# NOTE(review): bare except -- any error, including KeyboardInterrupt,
# leads to creating a new cluster.
except:
    print('creating new dsvm.')
    # NOTE(review): this comment used to describe SKU Standard_D2_v2 (a
    # 2 core machine), but the code requests "Standard_NC6". Check the Azure
    # virtual machines documentation for the available SKUs.
    dsvm_config = AmlCompute.provisioning_configuration(vm_size="Standard_NC6", max_nodes=nodes, min_nodes=0)
    dsvm_compute = AmlCompute.create(ws, name=dsvm_name, provisioning_configuration=dsvm_config)
    # Block until provisioning has completed, streaming progress output.
    dsvm_compute.wait_for_completion(show_output=True)

# The experiment name carries the current timestamp (time.time()), so the
# name differs between runs.
automl_settings = {
    "name": "AutoML_Demo_Experiment_{0}".format(time.time()),
    "iteration_timeout_minutes": 60,
    "iterations": 20,
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    "preprocess": True,
    "verbosity": logging.INFO,
    "max_concurrent_iterations": nodes
}

## note here that the project folder gets uploaded to our DSVM.
## therefore we must have any extra classes/files in there as well i.e. app_helper.py and app_config.conf
# (The AutoMLConfig call below continues past the end of this chunk.)
automated_ml_config = AutoMLConfig(
def _mask_secrets(parameters):
    """Return a shallow copy of *parameters* with the admin password hidden."""
    masked = dict(parameters)
    if masked.get("admin_user_password"):
        masked["admin_user_password"] = "***"
    return masked


def main():
    """Create or update an AML Compute cluster from a JSON parameters file.

    Inputs are read from the environment:

    * ``INPUT_PARAMETERSFILE`` - file name inside ``.aml/`` (defaults to
      ``workspace.json``).
    * ``INPUT_AZURECREDENTIALS`` - JSON document with ``tenantId``,
      ``clientId``, ``clientSecret`` and ``subscriptionId``.

    The parameters file must provide ``name``, ``vm_size``, ``vm_priority``,
    ``min_nodes``, ``max_nodes`` and ``idle_seconds_before_scaledown``.
    ``resourceGroup``, ``tags``, ``description``, the three ``vnet_*`` /
    ``subnet_name`` keys and the ``admin_*`` keys may be omitted.

    Configuration and authentication problems are reported as ``::error::``
    lines on stdout and make the function return ``None`` early.

    Raises:
        KeyError: if one of the required parameter keys is missing.
        Exception: if the cluster ends up in the ``Failed`` provisioning
            state (the cluster is deleted before raising).
    """
    # Loading input values
    print("::debug::Loading input values")
    parameters_file = os.environ.get("INPUT_PARAMETERSFILE", default="workspace.json")
    azure_credentials = os.environ.get("INPUT_AZURECREDENTIALS", default="{}")
    try:
        azure_credentials = json.loads(azure_credentials)
    except json.JSONDecodeError:
        # Report malformed credentials the same way as every other
        # configuration problem instead of crashing with a traceback.
        print(
            "::error::Please paste output of `az ad sp create-for-rbac --name <your-sp-name> --role contributor --scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> --sdk-auth` as value of secret variable: AZURE_CREDENTIALS"
        )
        return

    # Loading parameters file
    print("::debug::Loading parameters file")
    parameters_file_path = os.path.join(".aml", parameters_file)
    try:
        with open(parameters_file_path) as f:
            parameters = json.load(f)
    except FileNotFoundError:
        print(
            f"::error::Could not find parameter file in {parameters_file_path}. Please provide a parameter file in your repository (e.g. .aml/workspace.json)."
        )
        return

    # Loading Workspace
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=azure_credentials.get("tenantId", ""),
        service_principal_id=azure_credentials.get("clientId", ""),
        service_principal_password=azure_credentials.get("clientSecret", ""))
    try:
        print("::debug::Loading existing Workspace")
        ws = Workspace.get(
            name=parameters.get("name", None),
            subscription_id=azure_credentials.get("subscriptionId", ""),
            resource_group=parameters.get("resourceGroup", None),
            auth=sp_auth)
        print("::debug::Successfully loaded existing Workspace")
    except AuthenticationException as exception:
        print(
            f"::error::Could not retrieve user token. Please paste output of `az ad sp create-for-rbac --name <your-sp-name> --role contributor --scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> --sdk-auth` as value of secret variable: AZURE_CREDENTIALS: {exception}"
        )
        return
    except AuthenticationError as exception:
        print(f"::error::Microsoft REST Authentication Error: {exception}")
        return
    except AdalError as exception:
        print(
            f"::error::Active Directory Authentication Library Error: {exception}"
        )
        return
    except ProjectSystemException as exception:
        print(f"::error::Workspace authorizationfailed: {exception}")
        return

    # TODO: Create compute if not existing.
    try:
        # Loading AMLCompute
        print("::debug::Loading existing AML Compute")
        cluster = AmlCompute(workspace=ws, name=parameters["name"])

        # Check settings and redeploy if required settings have changed
        print("::debug::Found existing cluster")
        size_changed = cluster.vm_size.lower() != parameters["vm_size"].lower()
        priority_changed = (
            cluster.vm_priority.lower() != parameters["vm_priority"].lower())
        if size_changed or priority_changed:
            cluster.delete()
            cluster.wait_for_completion(show_output=True)
            # Raising here deliberately jumps to the creation branch below.
            raise ComputeTargetException(
                "Cluster is of incorrect size or has incorrect priority. Deleting cluster and provisioning a new one."
            )

        # Update AMLCompute
        # NOTE: the scale settings are pushed unconditionally; they are not
        # compared with the cluster's current values first.
        print("::debug::Updating settings of Cluster")
        cluster.update(
            min_nodes=parameters["min_nodes"],
            max_nodes=parameters["max_nodes"],
            idle_seconds_before_scaledown=parameters[
                "idle_seconds_before_scaledown"])

        # Wait until the operation has completed
        cluster.wait_for_completion(show_output=True)
        print("::debug::Successfully updated Cluster definition")
    except ComputeTargetException:
        print("::debug::Loading failed")
        print("::debug::Creating new AML Compute resource")
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=parameters["vm_size"],
            vm_priority=parameters["vm_priority"],
            min_nodes=parameters["min_nodes"],
            max_nodes=parameters["max_nodes"],
            idle_seconds_before_scaledown=parameters[
                "idle_seconds_before_scaledown"],
            # Optional metadata: fall back to None when the key is absent.
            tags=parameters.get("tags"),
            description=parameters.get("description"))

        # Deploy to VNET if provided (all three values are needed)
        vnet_resource_group = parameters.get("vnet_resource_group_name")
        vnet_name = parameters.get("vnet_name")
        subnet_name = parameters.get("subnet_name")
        if vnet_resource_group and vnet_name and subnet_name:
            compute_config.vnet_resourcegroup_name = vnet_resource_group
            compute_config.vnet_name = vnet_name
            compute_config.subnet_name = subnet_name

        # Set Credentials if provided; a password takes precedence over an
        # SSH key when both are present.
        admin_username = parameters.get("admin_username")
        admin_password = parameters.get("admin_user_password")
        admin_ssh_key = parameters.get("admin_user_ssh_key")
        if admin_username and admin_password:
            compute_config.admin_username = admin_username
            compute_config.admin_user_password = admin_password
        elif admin_username and admin_ssh_key:
            compute_config.admin_username = admin_username
            compute_config.admin_user_ssh_key = admin_ssh_key

        # Create Compute Target
        cluster = ComputeTarget.create(
            workspace=ws,
            name=parameters["name"],
            provisioning_configuration=compute_config)

        # Wait until the cluster is attached
        cluster.wait_for_completion(show_output=True)

    # Checking status of AMLCompute Cluster
    print("::debug::Checking status of AMLCompute Cluster")
    if cluster.provisioning_state == "Failed":
        cluster.delete()
        raise Exception(
            "::debug::Deployment of AMLCompute Cluster failed with the following status: {} and logs: \n{}"
            .format(cluster.provisioning_state, cluster.provisioning_errors))

    # Echo the effective parameters, but never write the admin password to
    # the build log.
    print(_mask_secrets(parameters))
    print(
        "::debug::Successfully finished Azure Machine Learning Compute Action")