def supported_vm_sizes(ws):
    """Get VM sizes available for your region.

    :param ws: azureml Workspace instance
    :return: list of dicts describing the supported VM sizes
    """
    # AmlCompute.supported_vmsizes already returns a list; the previous
    # identity comprehension only produced a useless shallow copy.
    return AmlCompute.supported_vmsizes(workspace=ws)
def __check_compute_target(self, compute_target, use_gpu: bool):
    """Validate that *compute_target* provides the requested device type.

    :param compute_target: an AmlCompute instance, or the name of a
        compute target registered in ``self.__workspace``
    :param use_gpu: True when the caller requires GPUs, False for CPUs
    :raises errors.TrainingComputeException: when the target's VM size
        is unknown for this region, or lacks the requested device type
    """
    vm_size = ''
    if isinstance(compute_target, AmlCompute):
        vm_size = compute_target.vm_size
    elif isinstance(compute_target, str):
        # Resolve a target name against the workspace to obtain its VM size.
        compute = ComputeTarget(workspace=self.__workspace, name=compute_target)
        vm_size = compute.vm_size

    # The size list requires a service call; fetch it once and cache it.
    if self.__vm_size_list is None:
        self.__vm_size_list = AmlCompute.supported_vmsizes(self.__workspace)

    # Case-insensitive lookup of the VM size description.
    wanted = str.upper(vm_size)
    vm_description = next(
        (size for size in self.__vm_size_list
         if str.upper(size['name']) == wanted),
        None)
    if vm_description is None:
        # Previously list(filter(...))[0] raised an opaque IndexError here.
        raise errors.TrainingComputeException(
            f'compute target VM size is not supported in this region: {vm_size} '
        )

    if (use_gpu and vm_description['gpus'] == 0):
        raise errors.TrainingComputeException(
            f'gpu_compute was specified, but the target does not have GPUs: {vm_description} '
        )
    if (not (use_gpu) and vm_description['vCPUs'] == 0):
        raise errors.TrainingComputeException(
            f'cpu_compute was specified, but the target does not have CPUs: {vm_description} '
        )
def get_aml_ws_sizes(self, aml_ws_name):
    """Return the VM sizes supported by the named AzureML workspace."""
    # TODO: make this an xt cmd: xt list sizes
    from azureml.core.compute import ComputeTarget, AmlCompute

    workspace = get_aml_ws(self.config, aml_ws_name)
    return AmlCompute.supported_vmsizes(workspace=workspace)
def show_available_vm_sizes(ctx):
    """Show which VM sizes are available in the workspace's Azure region."""
    workspace = Workspace.from_config()
    sizes = AmlCompute.supported_vmsizes(workspace=workspace)
    pprint(sizes)
    print(
        "\n>>> For VM prices, see https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/ <<<\n"
    )
def __init__(
    self,
    workspace,
    compute_target=None,
    environment_definition=None,
    experiment_name=None,
    initial_node_count=None,
    jupyter=None,
    jupyter_port=None,
    dashboard_port=None,
    scheduler_port=None,
    scheduler_idle_timeout=None,
    worker_death_timeout=None,
    additional_ports=None,
    admin_username=None,
    admin_ssh_key=None,
    datastores=None,
    code_store=None,
    vnet_resource_group=None,
    vnet=None,
    subnet=None,
    show_output=False,
    telemetry_opt_out=None,
    asynchronous=False,
    **kwargs,
):
    """Set up a Dask cluster on an AzureML compute target.

    Stores the configuration on ``self``, creates a compute target when
    none is supplied, determines GPU availability from the target's VM
    size, selects a default curated environment when none is given,
    validates ``additional_ports``, and finally starts the event loop
    and creates the cluster (synchronously unless ``asynchronous``).

    :param workspace: azureml Workspace the cluster runs in
    :param compute_target: existing AmlCompute target; created on the
        fly when None
    :param environment_definition: azureml Environment; defaults to the
        curated "AzureML-Dask-GPU"/"AzureML-Dask-CPU" environment
    :param additional_ports: optional list of (int, int) port tuples to
        forward; validated below
    :raises TypeError: when ``additional_ports`` is malformed
    """
    ### REQUIRED PARAMETERS
    self.workspace = workspace
    self.compute_target = compute_target

    ### ENVIRONMENT
    self.environment_definition = environment_definition

    ### EXPERIMENT DEFINITION
    self.experiment_name = experiment_name
    self.tags = {"tag": "azureml-dask"}

    ### ENVIRONMENT AND VARIABLES
    self.initial_node_count = initial_node_count

    ### SEND TELEMETRY
    self.telemetry_opt_out = telemetry_opt_out
    self.telemetry_set = False

    ### FUTURE EXTENSIONS
    self.kwargs = kwargs
    self.show_output = show_output

    ## CREATE COMPUTE TARGET
    self.admin_username = admin_username
    self.admin_ssh_key = admin_ssh_key
    self.vnet_resource_group = vnet_resource_group
    self.vnet = vnet
    self.subnet = subnet
    self.compute_target_set = True
    self.pub_key_file = ""
    self.pri_key_file = ""
    if self.compute_target is None:
        # No target supplied: provision one; on failure log and bail out
        # (NOTE(review): __init__ returns early, leaving a half-built object).
        try:
            self.compute_target = self.__create_compute_target()
            self.compute_target_set = False
        except Exception as e:
            logger.exception(e)
            return
    elif self.compute_target.admin_user_ssh_key is not None and (
        self.admin_ssh_key is None or self.admin_username is None
    ):
        # SSH-enabled target but no credentials given: cannot proceed.
        logger.exception(
            "Please provide private key and admin username to access compute target {}".format(
                self.compute_target.name
            )
        )
        return

    ### GPU RUN INFO
    # Map lowercase VM-size name -> GPU count for this workspace's region.
    self.workspace_vm_sizes = AmlCompute.supported_vmsizes(self.workspace)
    self.workspace_vm_sizes = [
        (e["name"].lower(), e["gpus"]) for e in self.workspace_vm_sizes
    ]
    self.workspace_vm_sizes = dict(self.workspace_vm_sizes)

    self.compute_target_vm_size = self.compute_target.serialize()["properties"][
        "status"
    ]["vmSize"].lower()
    self.n_gpus_per_node = self.workspace_vm_sizes[self.compute_target_vm_size]
    self.use_gpu = True if self.n_gpus_per_node > 0 else False

    # Pick the curated Dask environment matching the hardware when the
    # caller did not supply one.
    if self.environment_definition is None:
        if self.use_gpu:
            self.environment_definition = self.workspace.environments[
                "AzureML-Dask-GPU"
            ]
        else:
            self.environment_definition = self.workspace.environments[
                "AzureML-Dask-CPU"
            ]

    ### JUPYTER AND PORT FORWARDING
    self.jupyter = jupyter
    self.jupyter_port = jupyter_port
    self.dashboard_port = dashboard_port
    self.scheduler_port = scheduler_port
    self.scheduler_idle_timeout = scheduler_idle_timeout
    self.portforward_proc = None
    self.worker_death_timeout = worker_death_timeout
    self.end_logging = False  # FLAG FOR STOPPING THE port_forward_logger THREAD

    # Validate additional_ports: must be a list of 2-tuples of ints.
    if additional_ports is not None:
        if type(additional_ports) != list:
            error_message = (
                f"The additional_ports parameter is of {type(additional_ports)}"
                " type but needs to be a list of int tuples."
                " Check the documentation."
            )
            logger.exception(error_message)
            raise TypeError(error_message)

        if len(additional_ports) > 0:
            if type(additional_ports[0]) != tuple:
                error_message = (
                    f"The additional_ports elements are of {type(additional_ports[0])}"
                    " type but needs to be a list of int tuples."
                    " Check the documentation."
                )
                raise TypeError(error_message)

            ### check if all elements are tuples of length two and int type
            all_correct = True
            for el in additional_ports:
                if type(el) != tuple or len(el) != 2:
                    all_correct = False
                    break
                if (type(el[0]), type(el[1])) != (int, int):
                    all_correct = False
                    break

            if not all_correct:
                error_message = (
                    "At least one of the elements of the additional_ports parameter"
                    " is wrong. Make sure it is a list of int tuples."
                    " Check the documentation."
                )
                raise TypeError(error_message)

    self.additional_ports = additional_ports

    self.scheduler_ip_port = (
        None  ### INIT FOR HOLDING THE ADDRESS FOR THE SCHEDULER
    )

    ### DATASTORES
    self.datastores = datastores

    ### RUNNING IN MATRIX OR LOCAL
    self.same_vnet = None
    self.is_in_ci = False

    ### GET RUNNING LOOP
    self._loop_runner = LoopRunner(loop=None, asynchronous=asynchronous)
    self.loop = self._loop_runner.loop

    self.abs_path = pathlib.Path(__file__).parent.absolute()

    ### INITIALIZE CLUSTER
    super().__init__(asynchronous=asynchronous)

    # In synchronous mode, block here until the cluster is fully created.
    if not self.asynchronous:
        self._loop_runner.start()
        self.sync(self.__get_defaults)
        if not self.telemetry_opt_out:
            self.__append_telemetry()
        self.sync(self.__create_cluster)
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

print("SDK version:", azureml.core.VERSION)

# Load the workspace using the Azure CLI credentials and the config.json
# stored next to this script.
ws = Workspace.from_config(auth=AzureCliAuthentication(), path=os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'config.json'))
print(f'Workspace: {ws.name}')

# Experiment that the BERT NER training run will be logged under.
experiment_name = 'train-bert-ner-on-amlcompute'
experiment = Experiment(workspace=ws, name=experiment_name)

# Fetched for reference only; result is not used below.
supported_vms = AmlCompute.supported_vmsizes(workspace=ws)
# print(supported_vms)

project_folder = './ner'

# Build the conda/pip environment for the training run.
bert_env = Environment("bert_aml_env")
conda_dep = CondaDependencies()
conda_dep.set_python_version('3.7.3')
conda_dep.add_pip_package("torch")
conda_dep.add_pip_package("adal")
conda_dep.add_pip_package("cloudpickle")
conda_dep.add_pip_package("docker")
conda_dep.add_pip_package("numpy")
conda_dep.add_pip_package("scipy")
conda_dep.add_pip_package("tokenizers")
def __init__(
    self,
    workspace,
    compute_target,
    environment_definition,
    experiment_name=None,
    initial_node_count=None,
    jupyter=None,
    jupyter_port=None,
    dashboard_port=None,
    scheduler_port=None,
    scheduler_idle_timeout=None,
    worker_death_timeout=None,
    additional_ports=None,
    admin_username=None,
    admin_ssh_key=None,
    datastores=None,
    code_store=None,
    asynchronous=False,
    **kwargs,
):
    """Set up a Dask cluster on an existing AzureML compute target.

    Unlike the vnet-aware variant, ``compute_target`` and
    ``environment_definition`` are required here. Determines GPU
    availability from the target's VM size, validates
    ``additional_ports``, then starts the event loop and creates the
    cluster (synchronously unless ``asynchronous``).

    :param workspace: azureml Workspace the cluster runs in
    :param compute_target: existing AmlCompute target to run on
    :param environment_definition: azureml Environment for the run
    :param additional_ports: optional list of (int, int) port tuples
    :raises TypeError: when ``additional_ports`` is malformed
    """
    ### REQUIRED PARAMETERS
    self.workspace = workspace
    self.compute_target = compute_target
    self.environment_definition = environment_definition

    ### EXPERIMENT DEFINITION
    self.experiment_name = experiment_name

    ### ENVIRONMENT AND VARIABLES
    self.initial_node_count = initial_node_count

    ### GPU RUN INFO
    # Map lowercase VM-size name -> GPU count for this workspace's region.
    self.workspace_vm_sizes = AmlCompute.supported_vmsizes(self.workspace)
    self.workspace_vm_sizes = [
        (e["name"].lower(), e["gpus"]) for e in self.workspace_vm_sizes
    ]
    self.workspace_vm_sizes = dict(self.workspace_vm_sizes)

    self.compute_target_vm_size = self.compute_target.serialize()["properties"][
        "status"
    ]["vmSize"].lower()
    self.n_gpus_per_node = self.workspace_vm_sizes[self.compute_target_vm_size]
    self.use_gpu = True if self.n_gpus_per_node > 0 else False

    ### JUPYTER AND PORT FORWARDING
    self.jupyter = jupyter
    self.jupyter_port = jupyter_port
    self.dashboard_port = dashboard_port
    self.scheduler_port = scheduler_port
    self.scheduler_idle_timeout = scheduler_idle_timeout
    self.worker_death_timeout = worker_death_timeout

    # Validate additional_ports: must be a list of 2-tuples of ints.
    if additional_ports is not None:
        if type(additional_ports) != list:
            error_message = (
                f"The additional_ports parameter is of {type(additional_ports)}"
                " type but needs to be a list of int tuples."
                " Check the documentation."
            )
            logger.exception(error_message)
            raise TypeError(error_message)

        if len(additional_ports) > 0:
            if type(additional_ports[0]) != tuple:
                error_message = (
                    f"The additional_ports elements are of {type(additional_ports[0])}"
                    " type but needs to be a list of int tuples."
                    " Check the documentation."
                )
                raise TypeError(error_message)

            ### check if all elements are tuples of length two and int type
            all_correct = True
            for el in additional_ports:
                if type(el) != tuple or len(el) != 2:
                    all_correct = False
                    break
                if (type(el[0]), type(el[1])) != (int, int):
                    all_correct = False
                    break

            if not all_correct:
                error_message = (
                    f"At least one of the elements of the additional_ports parameter"
                    " is wrong. Make sure it is a list of int tuples."
                    " Check the documentation."
                )
                raise TypeError(error_message)

    self.additional_ports = additional_ports
    self.admin_username = admin_username
    self.admin_ssh_key = admin_ssh_key

    self.scheduler_ip_port = (
        None  ### INIT FOR HOLDING THE ADDRESS FOR THE SCHEDULER
    )

    ### DATASTORES
    self.datastores = datastores

    ### FUTURE EXTENSIONS
    self.kwargs = kwargs

    ### RUNNING IN MATRIX OR LOCAL
    self.same_vnet = None

    ### GET RUNNING LOOP
    self._loop_runner = LoopRunner(loop=None, asynchronous=asynchronous)
    self.loop = self._loop_runner.loop

    self.abs_path = pathlib.Path(__file__).parent.absolute()

    ### INITIALIZE CLUSTER
    super().__init__(asynchronous=asynchronous)

    # In synchronous mode, block here until the cluster is fully created.
    if not self.asynchronous:
        self._loop_runner.start()
        self.sync(self.__get_defaults)
        self.sync(self.__create_cluster)
def __init__(
    self,
    workspace,
    compute_target,
    environment_definition,
    experiment_name=None,
    initial_node_count=None,
    jupyter=None,
    jupyter_port=None,
    additional_ports=None,
    admin_username=None,
    admin_ssh_key=None,
    telemetry_opt_out=None,
    **kwargs,
):
    """Set up a single-node AzureML compute cluster (azureml-ngc-tools).

    Determines GPU availability from the compute target's VM size,
    validates ``additional_ports``, records GPU info in
    ``self.script_params`` for the head-node script, then creates the
    cluster synchronously.

    :param workspace: azureml Workspace the cluster runs in
    :param compute_target: existing AmlCompute target to run on
    :param environment_definition: azureml Environment for the run
    :param additional_ports: optional list of (int, int) port tuples
    :param telemetry_opt_out: when falsy, telemetry is appended
    :raises TypeError: when ``additional_ports`` is malformed
    """
    ### REQUIRED PARAMETERS
    self.workspace = workspace
    self.compute_target = compute_target
    self.environment_definition = environment_definition

    ### EXPERIMENT DEFINITION
    self.experiment_name = experiment_name
    self.tags = {"tag": "azureml-ngc-tools"}

    ### ENVIRONMENT AND VARIABLES
    self.initial_node_count = initial_node_count

    ### SEND TELEMETRY
    self.telemetry_opt_out = telemetry_opt_out
    self.telemetry_set = False

    ### GPU RUN INFO
    # Map lowercase VM-size name -> GPU count for this workspace's region.
    self.workspace_vm_sizes = AmlCompute.supported_vmsizes(self.workspace)
    self.workspace_vm_sizes = [(e["name"].lower(), e["gpus"])
                               for e in self.workspace_vm_sizes]
    self.workspace_vm_sizes = dict(self.workspace_vm_sizes)

    self.compute_target_vm_size = self.compute_target.serialize(
    )["properties"]["status"]["vmSize"].lower()
    self.n_gpus_per_node = self.workspace_vm_sizes[
        self.compute_target_vm_size]
    self.use_gpu = True if self.n_gpus_per_node > 0 else False

    ### JUPYTER AND PORT FORWARDING
    self.jupyter = jupyter
    self.jupyter_port = jupyter_port
    self.portforward_proc = None
    self.end_logging = False  # FLAG FOR STOPPING THE port_forward_logger THREAD

    # Validate additional_ports: must be a list of 2-tuples of ints.
    if additional_ports is not None:
        if type(additional_ports) != list:
            error_message = (
                f"The additional_ports parameter is of {type(additional_ports)}"
                " type but needs to be a list of int tuples."
                " Check the documentation.")
            logger.exception(error_message)
            raise TypeError(error_message)

        if len(additional_ports) > 0:
            if type(additional_ports[0]) != tuple:
                error_message = (
                    f"The additional_ports elements are of {type(additional_ports[0])}"
                    " type but needs to be a list of int tuples."
                    " Check the documentation.")
                raise TypeError(error_message)

            ### check if all elements are tuples of length two and int type
            all_correct = True
            for el in additional_ports:
                if type(el) != tuple or len(el) != 2:
                    all_correct = False
                    break
                if (type(el[0]), type(el[1])) != (int, int):
                    all_correct = False
                    break

            if not all_correct:
                error_message = (
                    f"At least one of the elements of the additional_ports parameter"
                    " is wrong. Make sure it is a list of int tuples."
                    " Check the documentation.")
                raise TypeError(error_message)

    # Unlike the Dask variants, this one normalizes None to an empty list.
    self.additional_ports = [] if additional_ports is None else additional_ports
    self.admin_username = admin_username
    self.admin_ssh_key = admin_ssh_key

    ### FUTURE EXTENSIONS
    self.kwargs = kwargs

    ### ABSOLUTE PATH
    self.abs_path = pathlib.Path(__file__).parent.absolute()

    # ### close the cluster handler
    # signal.signal(signal.SIGINT, self.__signal_handler)

    ### define script parameters
    # Passed to the head-node entry script so it knows the GPU layout.
    self.script_params = {}
    self.script_params["--use_gpu"] = self.use_gpu
    self.script_params["--n_gpus_per_node"] = self.n_gpus_per_node

    ### headnode info
    self.headnode_info = {}

    if not self.telemetry_opt_out:
        self.__append_telemetry()

    self.__create_cluster()
    self.__print_message("Cluster created...")
from azureml.core import ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies

# Hard-coded Azure subscription / workspace identifiers for this script.
subscription_id = "0000-00000000-00000000-0000"
resource_group = "AdvanceAnalytics.ML"
workspace_name = "aa-ml-aml-workspace"
workspace_region = 'eastus'
computetarget_vm = 'Standard_NC6'

# Create (or reuse, via exist_ok=True) the workspace.
ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

# Called for reference only; the returned size list is not used below.
AmlCompute.supported_vmsizes(ws)

# Create a new runconfig object
run_config = RunConfiguration()
run_config.target = "amlcompute"
run_config.amlcompute.vm_size = computetarget_vm
run_config.framework = 'python'
# GPU-enabled TensorFlow base image for the NC-series target above.
run_config.environment.docker.base_image = 'tensorflow/tensorflow:1.6.0-gpu'
run_config.environment.docker.gpu_support = True
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=[
        'opencv==3.4.1', 'urllib3', 'tqdm', 'scikit-learn', 'pandas',
        'tensorflow-gpu', 'keras-gpu'
    ])

working_dir = r'.\keras'
def get_available_vm_sizes_from_config():
    """Return the VM sizes supported in the 'francecentral' region for the configured workspace."""
    workspace = Workspace.from_config()
    return AmlCompute.supported_vmsizes(workspace=workspace, location="francecentral")
def start(login, app):
    """Provision an AzureML cluster from NGC config files and keep it alive.

    Reads the login/app configs, resolves the workspace, verifies the
    requested VM size is an NC-series v2/v3 (Pascal/Volta) GPU size,
    creates or reuses the compute target and environment, uploads any
    additional content, then launches either a single-node
    AzureMLComputeCluster or a multi-node Dask AzureMLCluster and blocks
    until Ctrl+C.

    :param login: path/handle understood by ngccontent.get_config with
        azureml_user and aml_compute sections
    :param app: path/handle for the application config (environment and
        additional content)
    :raises Exception: when the workspace does not exist or the VM size
        is unsupported
    """
    login_config = ngccontent.get_config(login)
    app_config = ngccontent.get_config(app)

    ### WORKSPACE
    subscription_id = login_config["azureml_user"]["subscription_id"]
    resource_group = login_config["azureml_user"]["resource_group"]
    workspace_name = login_config["azureml_user"]["workspace_name"]

    try:
        ws = Workspace(
            workspace_name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group,
        )
    except ProjectSystemException:
        # Workspace lookup failed: point the user at the creation docs.
        msg = f'\n\nThe workspace "{workspace_name}" does not exist. '
        msg += f"Go to \n\n "
        msg += f"-->> https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace <<--\n\n"
        msg += f"and create the workspace first.\n\n\n"
        msg += f"Your current configuration: \n\n"
        msg += f"Workspace name: {workspace_name} \n"
        msg += f"Subscription id: {subscription_id} \n"
        msg += f"Resource group: {resource_group}\n\n"
        logger.exception(msg)
        raise Exception(msg)

    verify = f"""
    Subscription ID: {subscription_id}
    Resource Group: {resource_group}
    Workspace: {workspace_name}"""
    logger.info(verify)

    ### experiment name
    exp_name = login_config["aml_compute"]["exp_name"]

    ### azure ml names
    ct_name = login_config["aml_compute"]["ct_name"]
    vm_name = login_config["aml_compute"]["vm_name"].lower()
    vm_priority = login_config["aml_compute"]["vm_priority"]

    ### trust but verify
    verify = f"""
    Experiment name: {exp_name}"""
    logger.info(verify)

    ### GPU RUN INFO
    workspace_vm_sizes = AmlCompute.supported_vmsizes(ws)
    # NOTE(review): double assignment — pascal_volta_pattern is never
    # used again; only `pattern` is referenced below.
    pascal_volta_pattern = pattern = re.compile(
        r"[a-z]+_nc[0-9]+[s]?_v[2,3]"
    )  ### matches NC-series v2 and v3
    workspace_vm_sizes = [
        (e["name"].lower(), e["gpus"])
        for e in workspace_vm_sizes
        if pattern.match(e["name"].lower())
    ]
    workspace_vm_sizes = dict(workspace_vm_sizes)

    ### GET COMPUTE TARGET
    if vm_name in workspace_vm_sizes:
        gpus_per_node = workspace_vm_sizes[vm_name]
        verify = f"""
    Compute target: {ct_name}
    VM Size: {vm_name}
    No of GPUs: {gpus_per_node}
    Priority: {vm_priority}
    """
        logger.info(verify)

        ### get SSH keys
        ssh_key_pub, pri_key_file = get_ssh_keys()

        if ct_name not in ws.compute_targets:
            logger.warning(f"Compute target {ct_name} does not exist...")
            ct = createOrGetComputeTarget(
                ws, ct_name, vm_name, vm_priority, ssh_key_pub, login_config
            )
        else:
            ct = ws.compute_targets[ct_name]
            # A previously failed provisioning is torn down and retried once.
            if ct.provisioning_state == "Failed":
                logger.warning(
                    f"Compute target {ct_name} found but provisioning_state is showing as 'failed'..."
                )
                logger.warning(f"Deleting {ct_name} target and will attempt again...")
                logger.warning(
                    f"If this fails again check that you have enough resources in your subscription..."
                )
                ct.delete()
                time.sleep(5)
                ct = createOrGetComputeTarget(
                    ws, ct_name, vm_name, vm_priority, ssh_key_pub, login_config
                )
            else:
                logger.info(f" Using pre-existing compute target {ct_name}")
    else:
        logger.exception("Unsupported vm_size {vm_size}".format(vm_size=vm_name))
        logger.exception("The specified vm size must be one of ...")
        for azure_gpu_vm_size in workspace_vm_sizes.keys():
            logger.exception("... " + azure_gpu_vm_size)
        raise Exception(
            "{vm_size} does not have Pascal or above GPU Family".format(vm_size=vm_name)
        )

    env = createOrGetEnvironment(ws, login_config, app_config)

    ### UPLOAD ADDITIONAL CONTENT IF NOT EXISTS
    for additional_content in app_config["additional_content"]["list"]:
        url = additional_content["url"]
        targetfile = additional_content["filename"]
        src_path = additional_content["localdirectory"]
        dest_path = additional_content["computedirectory"]
        if (
            "source" in additional_content.keys()
            and additional_content["source"] == "github"
        ):
            ngccontent.clone_github_repo(url, "additional_content", src_path)
        else:
            if app_config["additional_content"]["download_content"]:
                ngccontent.download(url, "additional_content", targetfile)
            if (
                app_config["additional_content"]["unzip_content"]
                and additional_content["zipped"]
            ):
                ngccontent.unzipFile(targetfile, "additional_content", src_path)
        # Upload applies to both github-cloned and downloaded content.
        if app_config["additional_content"]["upload_content"]:
            ngccontent.upload_data(
                ws,
                ws.get_default_datastore(),
                "additional_content/" + src_path,
                dest_path,
            )

    # Single node -> lightweight compute cluster; multiple -> Dask cluster.
    if (login_config["aml_compute"]["max_nodes"]==1):
        amlcluster = AzureMLComputeCluster(
            workspace=ws,
            compute_target=ct,
            initial_node_count=1,
            experiment_name=login_config["aml_compute"]["exp_name"],
            environment_definition=env,
            jupyter_port=login_config["aml_compute"]["jupyter_port"],
            telemetry_opt_out=login_config["azureml_user"]["telemetry_opt_out"],
            admin_username=login_config["aml_compute"]["admin_name"],
            admin_ssh_key=pri_key_file,
        )
    else:
        logger.info("Creating a Dask Cluster with {} nodes".format(login_config["aml_compute"]["max_nodes"]))
        amlcluster = AzureMLCluster(
            workspace=ws,
            compute_target=ct,
            initial_node_count=login_config["aml_compute"]["max_nodes"],
            experiment_name=login_config["aml_compute"]["exp_name"],
            environment_definition=env,
            jupyter_port=login_config["aml_compute"]["jupyter_port"],
            telemetry_opt_out=login_config["azureml_user"]["telemetry_opt_out"],
            admin_username=login_config["aml_compute"]["admin_name"],
            admin_ssh_key=pri_key_file,
        )

    logger.info(f"\n Go to: {amlcluster.jupyter_link}")
    logger.info(" Press Ctrl+C to stop the cluster.")
    # Busy-wait until the user interrupts, then tear the cluster down.
    try:
        while True:
            pass
    except KeyboardInterrupt:
        amlcluster.close()